hydroserverpy 1.3.0b3__py3-none-any.whl → 1.4.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hydroserverpy might be problematic.

Files changed (39)
  1. hydroserverpy/__init__.py +0 -2
  2. hydroserverpy/api/models/etl/__init__.py +26 -0
  3. hydroserverpy/api/models/etl/data_source.py +107 -72
  4. hydroserverpy/api/models/etl/etl_configuration.py +224 -0
  5. hydroserverpy/api/models/etl/extractors/__init__.py +6 -0
  6. hydroserverpy/{etl → api/models/etl}/extractors/base.py +16 -19
  7. hydroserverpy/{etl → api/models/etl}/extractors/http_extractor.py +5 -3
  8. hydroserverpy/api/models/etl/extractors/local_file_extractor.py +20 -0
  9. hydroserverpy/api/models/etl/factories.py +23 -0
  10. hydroserverpy/api/models/etl/loaders/__init__.py +4 -0
  11. hydroserverpy/{etl → api/models/etl}/loaders/base.py +0 -2
  12. hydroserverpy/api/models/etl/loaders/hydroserver_loader.py +100 -0
  13. hydroserverpy/api/models/etl/schedule.py +16 -0
  14. hydroserverpy/api/models/etl/status.py +14 -0
  15. hydroserverpy/api/models/etl/transformers/__init__.py +5 -0
  16. hydroserverpy/api/models/etl/transformers/base.py +128 -0
  17. hydroserverpy/{etl → api/models/etl}/transformers/csv_transformer.py +24 -13
  18. hydroserverpy/{etl → api/models/etl}/transformers/json_transformer.py +7 -6
  19. hydroserverpy/api/services/etl/data_source.py +1 -4
  20. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/METADATA +1 -1
  21. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/RECORD +28 -29
  22. hydroserverpy/etl/__init__.py +0 -21
  23. hydroserverpy/etl/extractors/__init__.py +0 -0
  24. hydroserverpy/etl/extractors/local_file_extractor.py +0 -19
  25. hydroserverpy/etl/hydroserver_etl.py +0 -40
  26. hydroserverpy/etl/loaders/__init__.py +0 -0
  27. hydroserverpy/etl/loaders/hydroserver_loader.py +0 -71
  28. hydroserverpy/etl/transformers/__init__.py +0 -0
  29. hydroserverpy/etl/transformers/base.py +0 -64
  30. hydroserverpy/etl_csv/__init__.py +0 -0
  31. hydroserverpy/etl_csv/exceptions.py +0 -14
  32. hydroserverpy/etl_csv/hydroserver_etl_csv.py +0 -346
  33. /hydroserverpy/{etl → api/models/etl}/extractors/ftp_extractor.py +0 -0
  34. /hydroserverpy/{etl → api/models/etl}/timestamp_parser.py +0 -0
  35. /hydroserverpy/{etl → api/models/etl}/types.py +0 -0
  36. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/WHEEL +0 -0
  37. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/licenses/LICENSE +0 -0
  38. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/top_level.txt +0 -0
  39. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/zip-safe +0 -0
hydroserverpy/etl/transformers/base.py (deleted)
@@ -1,64 +0,0 @@
-from abc import ABC, abstractmethod
-import logging
-from typing import Union
-from hydroserverpy.etl.timestamp_parser import TimestampParser
-import pandas as pd
-
-
-class Transformer(ABC):
-    def __init__(self, settings: object):
-        self.timestamp = settings["timestamp"]
-        self.timestamp_key: Union[str, int] = self.timestamp["key"]
-
-        if isinstance(self.timestamp_key, int):
-            # Users will always interact in 1-based, so if the key is a column index, convert to 0-based
-            self.timestamp_key = self.timestamp_key - 1
-
-        self.timestamp_parser = TimestampParser(self.timestamp)
-
-    @abstractmethod
-    def transform(self, *args, **kwargs) -> None:
-        pass
-
-    @property
-    def needs_datastreams(self) -> bool:
-        return False
-
-    def standardize_dataframe(self, df: pd.DataFrame, payload_mappings):
-        rename_map = {
-            mapping["sourceIdentifier"]: mapping["targetIdentifier"]
-            for mapping in payload_mappings
-        }
-
-        df.rename(
-            columns={self.timestamp_key: "timestamp", **rename_map},
-            inplace=True,
-        )
-
-        # Verify timestamp column is present in the DataFrame
-        if "timestamp" not in df.columns:
-            message = f"Timestamp column '{self.timestamp_key}' not found in data."
-            logging.error(message)
-            raise ValueError(message)
-
-        # verify datastream columns
-        expected = set(rename_map.values())
-        missing = expected - set(df.columns)
-        if missing:
-            raise ValueError(
-                "The following datastream IDs are specified in the config file but their related keys could not be "
-                f"found in the source system's extracted data: {missing}"
-            )
-
-        # keep only timestamp + datastream columns; remove the rest inplace
-        to_keep = ["timestamp", *expected]
-        df.drop(columns=df.columns.difference(to_keep), inplace=True)
-
-        df["timestamp"] = self.timestamp_parser.parse_series(df["timestamp"])
-
-        df.drop_duplicates(subset=["timestamp"], keep="last")
-        logging.info(f"standardized dataframe created: {df.shape}")
-        logging.info(f"{df.info()}")
-        logging.info(f"{df.head()}")
-
-        return df
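For context on what the removed base class did, the sketch below reproduces the core of standardize_dataframe with plain pandas: rename the timestamp and mapped columns, verify that every target identifier is present, drop everything else, and deduplicate on the timestamp. The sample column names and mapping values are hypothetical, and timestamp parsing is simplified to pd.to_datetime in place of the package's TimestampParser.

import pandas as pd

# Hypothetical extracted data and payload mappings, mirroring the removed
# Transformer.standardize_dataframe logic above.
df = pd.DataFrame(
    {
        "DateTime": ["2024-01-01T00:00:00", "2024-01-01T00:15:00", "2024-01-01T00:15:00"],
        "WaterTemp_C": [4.1, 4.2, 4.2],
        "BatteryVolts": [12.6, 12.6, 12.6],  # not mapped, so it gets dropped
    }
)
payload_mappings = [{"sourceIdentifier": "WaterTemp_C", "targetIdentifier": "datastream-uuid-1"}]

rename_map = {m["sourceIdentifier"]: m["targetIdentifier"] for m in payload_mappings}
df.rename(columns={"DateTime": "timestamp", **rename_map}, inplace=True)

# Verify the timestamp and every mapped datastream column survived the rename.
missing = ({"timestamp"} | set(rename_map.values())) - set(df.columns)
if missing:
    raise ValueError(f"Missing columns in extracted data: {missing}")

# Keep only the timestamp plus mapped columns, parse timestamps, and deduplicate.
df = df[["timestamp", *rename_map.values()]]
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.drop_duplicates(subset=["timestamp"], keep="last")
print(df)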
File without changes
hydroserverpy/etl_csv/exceptions.py (deleted)
@@ -1,14 +0,0 @@
-class HeaderParsingError(Exception):
-    """
-    Raised when the header of a CSV file cannot be parsed due to incorrect field names or out of range index values.
-    """
-
-    pass
-
-
-class TimestampParsingError(Exception):
-    """
-    Raised when the timestamp of a CSV file row cannot be parsed.
-    """
-
-    pass
hydroserverpy/etl_csv/hydroserver_etl_csv.py (deleted)
@@ -1,346 +0,0 @@
-import csv
-import math
-import logging
-import croniter
-import pandas as pd
-from typing import IO, List, TYPE_CHECKING
-from requests import HTTPError
-from datetime import datetime, timezone, timedelta
-from dateutil.parser import isoparse
-from .exceptions import HeaderParsingError, TimestampParsingError
-import warnings
-
-if TYPE_CHECKING:
-    from hydroserverpy.api.models import DataSource
-
-logger = logging.getLogger("hydroserver_etl")
-logger.addHandler(logging.NullHandler())
-
-
-class HydroServerETLCSV:
-
-    def __init__(
-        self,
-        service,
-        data_file: IO[str],
-        data_source: "DataSource",
-    ):
-        warnings.warn(
-            "HydroServerETLCSV is deprecated and will be removed in a future version. "
-            "Please use the new HydroServerETL class.",
-            DeprecationWarning,
-        )
-        self._service = service
-        self._data_file = data_file
-        self._data_source = data_source
-        self._datastreams = {
-            datastream.uid: datastream for datastream in data_source.datastreams
-        }
-
-        self._datastream_mapping = {
-            mapping["targetIdentifier"]: mapping["sourceIdentifier"]
-            for payload in self._data_source.settings["payloads"]
-            for mapping in payload.get("mappings", [])
-        }
-
-        self._timestamp_column_index = None
-        self._datastream_column_indexes = None
-        self._datastream_start_row_indexes = {}
-
-        self._message = None
-        self._failed_datastreams = []
-        self._file_header_error = False
-        self._file_timestamp_error = False
-
-        self._chunk_size = 1000
-        self._observations = {}
-
-    def run(self):
-        """
-        The run function is the main function of this class. It reads in a data file and parses it into observations,
-        which are then posted to HydroServer. The run function also updates the DataSource object with information about
-        the sync process.
-
-        :param self
-        :return: None
-        """
-
-        data_reader = csv.reader(
-            self._data_file,
-            delimiter=self._data_source.settings["transformer"]["delimiter"],
-        )
-
-        try:
-            for i, row in enumerate(data_reader):
-
-                # Parse through the data file to get header info and start reading observations.
-                self._parse_data_file_row(i + 1, row)
-
-                # Post chunked observations once chunk size has been reached.
-                if i > 0 and i % self._chunk_size == 0:
-                    self._failed_datastreams.extend(self._post_observations())
-
-        except HeaderParsingError as e:
-            self._message = f"Failed to parse header for {self._data_source.name} with error: {str(e)}"
-            logger.error(self._message)
-            self._file_header_error = True
-
-        except TimestampParsingError as e:
-            self._message = f"Failed to parse one or more timestamps for {self._data_source.name} with error: {str(e)}"
-            logger.error(self._message)
-            self._file_timestamp_error = True
-
-        # Post final chunk of observations after file has been fully parsed.
-        self._failed_datastreams.extend(self._post_observations())
-
-        if not self._message and len(self._failed_datastreams) > 0:
-            self._message = f"One or more datastreams failed to sync with HydroServer for {self._data_source.name}."
-
-        self._update_data_source()
-
-    def _parse_data_file_row(self, index: int, row: List[str]) -> None:
-        """
-        The parse_data_file_row function is used to parse the data file row by row. The function takes in two
-        arguments: index and row. The index argument is the current line number of the data file, and it's used to
-        determine if we are at a header or not (if so, then we need to determine the column index for each named
-        column). The second argument is a list containing all the values for each column on that particular line. If
-        this isn't a header, then we check if there are any observations with timestamps later than the latest
-        timestamp for the associated datastream; if so, then add them into our observation_bodies to be posted.
-
-        :param self
-        :param index: Keep track of the row number in the file
-        :param row: Access the row of data in the csv file
-        :return: A list of datetime and value pairs for each datastream
-        """
-
-        if index == self._data_source.settings["transformer"]["headerRow"] or (
-            index == self._data_source.settings["transformer"]["dataStartRow"]
-            and self._timestamp_column_index is None
-        ):
-            self._parse_file_header(row)
-
-        if index < self._data_source.settings["transformer"]["dataStartRow"]:
-            return
-
-        timestamp = self._parse_row_timestamp(row)
-
-        for datastream in self._datastreams.values():
-            if index == self._data_source.settings["transformer"]["dataStartRow"]:
-                datastream.sync_phenomenon_end_time()
-
-            if str(datastream.uid) not in self._datastream_start_row_indexes.keys():
-                if (
-                    not datastream.phenomenon_end_time
-                    or timestamp > datastream.phenomenon_end_time
-                ):
-                    self._datastream_start_row_indexes[str(datastream.uid)] = index
-
-            if (
-                str(datastream.uid) in self._datastream_start_row_indexes.keys()
-                and self._datastream_start_row_indexes[str(datastream.uid)] <= index
-            ):
-                if str(datastream.uid) not in self._observations.keys():
-                    self._observations[str(datastream.uid)] = []
-
-                raw_result = row[
-                    self._datastream_column_indexes[
-                        self._datastream_mapping[str(datastream.uid)]
-                    ]
-                ]
-
-                if isinstance(raw_result, (int, float)):
-                    result = raw_result
-                else:
-                    try:
-                        result = float(raw_result)
-                    except (TypeError, ValueError):
-                        result = datastream.no_data_value
-
-                if math.isnan(result):
-                    result = datastream.no_data_value
-
-                self._observations[str(datastream.uid)].append(
-                    {
-                        "phenomenon_time": timestamp,
-                        "result": result,
-                    }
-                )
-
-    def _parse_file_header(self, row: List[str]) -> None:
-        """
-        The _parse_file_header function is used to parse the header of a file.
-        It takes in a row (a list of strings) and parses it for the timestamp column index,
-        and datastream column indexes. It then sets these values as attributes on self._timestamp_column_index,
-        and self._datastream_column_indexes respectively.
-
-        :param self: Refer to the object itself
-        :param row: List[str]: Parse the header of a csv file
-        :return: A dictionary of the datastreams with their column index
-        """
-
-        try:
-            timestamp_key = (self._data_source.settings["transformer"].get("timestampKey") or
-                             self._data_source.settings["transformer"]["timestamp"]["key"])
-            self._timestamp_column_index = (
-                row.index(timestamp_key)
-                if isinstance(timestamp_key, str)
-                else int(timestamp_key) - 1
-            )
-            if self._timestamp_column_index > len(row):
-                raise ValueError
-            self._datastream_column_indexes = {
-                self._datastream_mapping[str(datastream.uid)]: (
-                    row.index(self._datastream_mapping[str(datastream.uid)])
-                    if not self._datastream_mapping[str(datastream.uid)].isdigit()
-                    else int(self._datastream_mapping[str(datastream.uid)]) - 1
-                )
-                for datastream in self._datastreams.values()
-            }
-            if len(self._datastream_column_indexes.values()) > 0 and max(
-                self._datastream_column_indexes.values()
-            ) > len(row):
-                raise ValueError
-        except ValueError as e:
-            logger.error(
-                f'Failed to load data from data source: "{self._data_source.name}"'
-            )
-            raise HeaderParsingError(str(e)) from e
-
-    def _parse_row_timestamp(self, row: List[str]) -> datetime:
-        """
-        The _parse_row_timestamp function takes a row of data from the CSV file and parses it into a datetime object.
-
-        :param self
-        :param row: List[str]: Parse the timestamp from a row of data
-        :return: A datetime object, which is a python standard library class
-        """
-
-        try:
-            timestamp_format = (self._data_source.settings["transformer"].get("timestampFormat") or
-                                self._data_source.settings["transformer"].get("timestamp", {}).get("format"))
-            if timestamp_format == "custom":
-                timestamp = datetime.strptime(
-                    row[self._timestamp_column_index], timestamp_format,
-                )
-            else:
-                timestamp = isoparse(row[self._timestamp_column_index])
-        except ValueError as e:
-            raise TimestampParsingError(str(e)) from e
-
-        if timestamp.tzinfo is None:
-            timestamp_offset = self._data_source.settings["transformer"].get(
-                "timestampOffset"
-            ) or self._data_source.settings["transformer"].get(
-                "timestamp", {}
-            ).get("offset")
-            if not timestamp_offset or timestamp_offset.endswith(
-                "0000"
-            ):
-                timestamp = timestamp.replace(tzinfo=timezone.utc)
-            else:
-                try:
-                    timestamp = timestamp.replace(
-                        tzinfo=datetime.strptime(
-                            timestamp_offset[:-2]
-                            + ":"
-                            + timestamp_offset[3:],
-                            "%z",
-                        ).tzinfo
-                    )
-                except ValueError as e:
-                    logger.error(
-                        f'Failed to load data from data source: "{self._data_source.name}"'
-                    )
-                    raise TimestampParsingError(str(e)) from e
-
-        return timestamp
-
-    def _post_observations(self) -> List[str]:
-        """
-        The _post_observations function is used to post observations to the SensorThings API.
-        The function returns a list of datastreams that failed to be posted.
-        The function iterates through all datastreams in self._observations, which is a dictionary with keys being
-        datastream IDs and values being lists of observation dictionaries (see _load_observations for more details).
-        For each datastream, if it has not previously failed posting observations or if there are any new observations
-        to post, the function posts the new observations to HydroServer using the SensorThings API.
-
-        :param self
-        :return: A list of failed datastreams
-        """
-
-        failed_datastreams = []
-
-        for datastream_id, observations in self._observations.items():
-            if datastream_id not in self._failed_datastreams and len(observations) > 0:
-
-                logger.info(
-                    f"Loading observations from "
-                    + f'{observations[0]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} to '
-                    + f'{observations[-1]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} for datastream: '
-                    + f'{str(datastream_id)} in data source "{self._data_source.name}".'
-                )
-
-                observations_df = pd.DataFrame(
-                    [
-                        [observation["phenomenon_time"], observation["result"]]
-                        for observation in observations
-                    ],
-                    columns=["phenomenon_time", "result"],
-                )
-
-                try:
-                    self._service.datastreams.load_observations(
-                        uid=datastream_id,
-                        observations=observations_df,
-                    )
-                except HTTPError as e:
-                    failed_datastreams.append(datastream_id)
-                    logger.error(f"Failed to POST observations to datastream: {str(datastream_id)} - {e}")
-
-            elif datastream_id in self._failed_datastreams:
-                logger.info(
-                    f"Skipping observations POST request from "
-                    + f'{observations[0]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} to '
-                    + f'{observations[-1]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} for datastream: '
-                    + f'{str(datastream_id)} in data source "{self._data_source.name}",'
-                    + f"due to previous failed POST request."
-                )
-
-        self._observations = {}
-
-        return failed_datastreams
-
-    def _update_data_source(self):
-        """
-        The _update_data_source function updates the data source with information about the last sync.
-
-        :param self
-        :return: None
-        """
-
-        if self._data_source.crontab is not None:
-            next_run = croniter.croniter(
-                self._data_source.crontab, datetime.now(timezone.utc)
-            ).get_next(datetime)
-        elif (
-            self._data_source.interval is not None
-            and self._data_source.interval_units is not None
-        ):
-            next_run = datetime.now(timezone.utc) + timedelta(
-                **{self._data_source.interval_units: self._data_source.interval}
-            )
-        else:
-            next_run = None
-
-        self._data_source.last_run_successful = (
-            True
-            if not self._file_timestamp_error
-            and not self._file_header_error
-            and len(self._failed_datastreams) == 0
-            else False
-        )
-        self._data_source.last_run_message = self._message
-        self._data_source.last_run = datetime.now(timezone.utc)
-        self._data_source.next_run = next_run
-
-        self._data_source.save()
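As a reference for the scheduling behavior being removed, the sketch below isolates the next-run computation from _update_data_source: a crontab takes precedence, otherwise an interval plus its units is added to the current time. It is a standalone illustration rather than part of the package, and the example crontab and interval values are made up.

import croniter
from datetime import datetime, timezone, timedelta


def compute_next_run(crontab=None, interval=None, interval_units=None):
    # Mirrors the scheduling branch of the removed _update_data_source method.
    now = datetime.now(timezone.utc)
    if crontab is not None:
        # e.g. "0 * * * *" -> the next top of the hour after now
        return croniter.croniter(crontab, now).get_next(datetime)
    if interval is not None and interval_units is not None:
        # e.g. interval=15, interval_units="minutes" -> 15 minutes from now
        return now + timedelta(**{interval_units: interval})
    return None


print(compute_next_run(crontab="0 * * * *"))
print(compute_next_run(interval=15, interval_units="minutes"))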
File without changes
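Returning to the run loop removed above: it buffered observations per datastream and flushed them every 1,000 rows before posting a final partial chunk. The sketch below shows that batching pattern with a stand-in flush function; the CSV content, datastream key, and print-based flush are hypothetical, whereas the real class posted each chunk to HydroServer via service.datastreams.load_observations.

import csv
import io

CHUNK_SIZE = 1000  # the removed class used self._chunk_size = 1000


def flush(observations):
    # Stand-in for posting buffered observations to HydroServer.
    total = sum(len(rows) for rows in observations.values())
    print(f"posting {total} observations")


# Hypothetical CSV source with a header row followed by timestamp,value rows.
rows = "\n".join(f"2024-01-01T{i // 60 % 24:02d}:{i % 60:02d}:00,{float(i)}" for i in range(2500))
data_file = io.StringIO("timestamp,value\n" + rows)

reader = csv.reader(data_file)
next(reader)  # skip the header row

observations = {"datastream-uuid-1": []}
for i, row in enumerate(reader, start=1):
    observations["datastream-uuid-1"].append({"phenomenon_time": row[0], "result": float(row[1])})
    if i % CHUNK_SIZE == 0:
        flush(observations)  # post a full chunk, then start a new buffer
        observations = {"datastream-uuid-1": []}

flush(observations)  # post the final partial chunk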