hydroserverpy 1.1.1__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hydroserverpy might be problematic. Click here for more details.

Files changed (78) hide show
  1. {hydroserverpy-1.1.1/src/hydroserverpy.egg-info → hydroserverpy-1.2.0}/PKG-INFO +1 -1
  2. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/setup.cfg +1 -1
  3. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/etl/data_source.py +2 -2
  4. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/iam/workspace.py +1 -1
  5. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/sta/datastream.py +4 -4
  6. hydroserverpy-1.2.0/src/hydroserverpy/etl/extractors/base.py +57 -0
  7. hydroserverpy-1.2.0/src/hydroserverpy/etl/extractors/http_extractor.py +29 -0
  8. hydroserverpy-1.2.0/src/hydroserverpy/etl/extractors/local_file_extractor.py +19 -0
  9. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/loaders/base.py +1 -1
  10. hydroserverpy-1.2.0/src/hydroserverpy/etl/loaders/hydroserver_loader.py +70 -0
  11. hydroserverpy-1.2.0/src/hydroserverpy/etl/timestamp_parser.py +75 -0
  12. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/transformers/base.py +6 -55
  13. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0/src/hydroserverpy.egg-info}/PKG-INFO +1 -1
  14. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy.egg-info/SOURCES.txt +1 -0
  15. hydroserverpy-1.1.1/src/hydroserverpy/etl/extractors/base.py +0 -13
  16. hydroserverpy-1.1.1/src/hydroserverpy/etl/extractors/http_extractor.py +0 -99
  17. hydroserverpy-1.1.1/src/hydroserverpy/etl/extractors/local_file_extractor.py +0 -29
  18. hydroserverpy-1.1.1/src/hydroserverpy/etl/loaders/hydroserver_loader.py +0 -91
  19. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/LICENSE +0 -0
  20. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/README.md +0 -0
  21. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/pyproject.toml +0 -0
  22. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/setup.py +0 -0
  23. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/__init__.py +0 -0
  24. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/__init__.py +0 -0
  25. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/http.py +0 -0
  26. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/main.py +0 -0
  27. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/__init__.py +0 -0
  28. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/base.py +0 -0
  29. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/etl/__init__.py +0 -0
  30. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/etl/data_archive.py +0 -0
  31. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/etl/orchestration_configuration.py +0 -0
  32. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/etl/orchestration_system.py +0 -0
  33. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/iam/__init__.py +0 -0
  34. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/iam/account.py +0 -0
  35. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/iam/apikey.py +0 -0
  36. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/iam/collaborator.py +0 -0
  37. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/iam/role.py +0 -0
  38. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/iam/workspace.py +0 -0
  39. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/sta/__init__.py +0 -0
  40. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/sta/datastream.py +0 -0
  41. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/sta/observed_property.py +0 -0
  42. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/sta/processing_level.py +0 -0
  43. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/sta/result_qualifier.py +0 -0
  44. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/sta/sensor.py +0 -0
  45. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/sta/thing.py +0 -0
  46. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/models/sta/unit.py +0 -0
  47. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/__init__.py +0 -0
  48. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/base.py +0 -0
  49. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/etl/__init__.py +0 -0
  50. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/etl/data_archive.py +0 -0
  51. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/etl/data_source.py +0 -0
  52. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/etl/orchestration_system.py +0 -0
  53. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/iam/__init__.py +0 -0
  54. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/sta/__init__.py +0 -0
  55. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/sta/observed_property.py +0 -0
  56. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/sta/processing_level.py +0 -0
  57. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/sta/result_qualifier.py +0 -0
  58. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/sta/sensor.py +0 -0
  59. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/sta/thing.py +0 -0
  60. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/api/services/sta/unit.py +0 -0
  61. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/__init__.py +0 -0
  62. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/extractors/__init__.py +0 -0
  63. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/extractors/ftp_extractor.py +0 -0
  64. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/hydroserver_etl.py +0 -0
  65. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/loaders/__init__.py +0 -0
  66. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/transformers/__init__.py +0 -0
  67. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/transformers/csv_transformer.py +0 -0
  68. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/transformers/json_transformer.py +0 -0
  69. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl/types.py +0 -0
  70. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl_csv/__init__.py +0 -0
  71. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl_csv/exceptions.py +0 -0
  72. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/etl_csv/hydroserver_etl_csv.py +0 -0
  73. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/quality/__init__.py +0 -0
  74. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy/quality/service.py +0 -0
  75. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy.egg-info/dependency_links.txt +0 -0
  76. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy.egg-info/requires.txt +0 -0
  77. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy.egg-info/top_level.txt +0 -0
  78. {hydroserverpy-1.1.1 → hydroserverpy-1.2.0}/src/hydroserverpy.egg-info/zip-safe +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hydroserverpy
3
- Version: 1.1.1
3
+ Version: 1.2.0
4
4
  Requires-Python: <4,>=3.9
5
5
  License-File: LICENSE
6
6
  Requires-Dist: requests>=2
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = hydroserverpy
3
- version = 1.1.1
3
+ version = 1.2.0
4
4
 
5
5
  [options]
6
6
  package_dir =
@@ -129,7 +129,7 @@ class DataSource(HydroServerModel, DataSourceFields, OrchestrationConfigurationF
129
129
  return
130
130
 
131
131
  if self.settings["extractor"]["type"] == "local":
132
- with open(self.settings["extractor"]["path"]) as data_file:
132
+ with open(self.settings["extractor"]["sourceUri"]) as data_file:
133
133
  loader = HydroServerETLCSV(
134
134
  self._connection, data_file=data_file, data_source=self
135
135
  )
@@ -137,7 +137,7 @@ class DataSource(HydroServerModel, DataSourceFields, OrchestrationConfigurationF
137
137
  elif self.settings["extractor"]["type"] == "HTTP":
138
138
  with tempfile.NamedTemporaryFile(mode="w+") as temp_file:
139
139
  response = requests.get(
140
- self.settings["extractor"]["urlTemplate"],
140
+ self.settings["extractor"]["sourceUri"],
141
141
  stream=True,
142
142
  timeout=60,
143
143
  )
@@ -178,7 +178,7 @@ class WorkspaceService(EndpointService):
178
178
  None,
179
179
  ...,
180
180
  )
181
- else None
181
+ else expires_at
182
182
  )
183
183
  }
184
184
  headers = {"Content-type": "application/json"}
@@ -195,7 +195,7 @@ class DatastreamService(SensorThingsService):
195
195
  None,
196
196
  ...,
197
197
  )
198
- else None
198
+ else phenomenon_begin_time
199
199
  ),
200
200
  "phenomenonEndTime": (
201
201
  phenomenon_end_time.isoformat()
@@ -204,7 +204,7 @@ class DatastreamService(SensorThingsService):
204
204
  None,
205
205
  ...,
206
206
  )
207
- else None
207
+ else phenomenon_end_time
208
208
  ),
209
209
  "resultBeginTime": (
210
210
  result_begin_time.isoformat()
@@ -213,7 +213,7 @@ class DatastreamService(SensorThingsService):
213
213
  None,
214
214
  ...,
215
215
  )
216
- else None
216
+ else result_begin_time
217
217
  ),
218
218
  "resultEndTime": (
219
219
  result_end_time.isoformat()
@@ -222,7 +222,7 @@ class DatastreamService(SensorThingsService):
222
222
  None,
223
223
  ...,
224
224
  )
225
- else None
225
+ else result_end_time
226
226
  ),
227
227
  "isPrivate": is_private,
228
228
  "isVisible": is_visible,
@@ -0,0 +1,57 @@
1
from abc import abstractmethod
import logging
from datetime import datetime

import pandas as pd

from hydroserverpy.etl.timestamp_parser import TimestampParser
7
+
8
+
9
class Extractor:
    """Base class for ETL extractors that resolve a templated source URI.

    Settings must contain "sourceUri" (optionally with {placeholder} fields)
    and may contain "placeholderVariables": a list of dicts with "name",
    "type" ("runTime" or "perPayload") and, for timestamp-valued variables,
    optional "timestampFormat"/"timestampOffset".
    """

    def __init__(self, settings: dict):
        self.settings = settings
        # Raises KeyError if the data source config omits "sourceUri".
        self.source_uri = settings["sourceUri"]

    def resolve_placeholder_variables(self, payload, loader):
        """Return the source URI with all placeholder variables filled in.

        :param payload: payload dict; its "extractorVariables" mapping
            supplies values for variables of type "perPayload".
        :param loader: loader used to answer "latestObservationTimestamp"
            runtime variables (must expose earliest_begin_date(payload)).
        :raises KeyError: if a required per-payload variable is missing.
        :raises ValueError: if a runtime variable has an unknown runTimeValue.
        """
        logging.info("Creating runtime variables...")
        filled = {}
        for var in self.settings.get("placeholderVariables", []):
            name = var["name"]
            var_type = var.get("type")

            if var_type == "runTime":
                logging.info("Resolving runtime var: %s", name)
                run_time_value = var.get("runTimeValue")
                if run_time_value == "latestObservationTimestamp":
                    value = loader.earliest_begin_date(payload)
                elif run_time_value == "jobExecutionTime":
                    value = pd.Timestamp.now(tz="UTC")
                else:
                    # BUG FIX: an unrecognized runTimeValue previously left
                    # `value` unbound and crashed below with NameError.
                    raise ValueError(
                        f"Unknown runTimeValue for variable '{name}': "
                        f"{run_time_value!r}"
                    )
            elif var_type == "perPayload":
                logging.info("Resolving payload var: %s", name)
                payload_vars = payload.get("extractorVariables", {})
                if name not in payload_vars:
                    raise KeyError(f"Missing per-payload variable '{name}'")
                value = payload_vars[name]
            else:
                # Variables of any other (or missing) type are ignored.
                continue

            if isinstance(value, (datetime, pd.Timestamp)):
                fmt = var.get("timestampFormat", "ISO8601")
                offset = var.get("timestampOffset", "+0000")
                parser = TimestampParser(fmt, offset)
                value = parser.format(value)

            filled[name] = value

        if not filled:
            return self.source_uri
        return self.format_uri(filled)

    def format_uri(self, placeholder_variables):
        """Substitute placeholder variables into the URI template.

        :raises KeyError: if the template references a variable that was
            not resolved.
        """
        try:
            return self.source_uri.format(**placeholder_variables)
        except KeyError as e:
            raise KeyError(f"Missing placeholder variable: {e.args[0]}") from e

    @abstractmethod
    def extract(self):
        """Return a file-like object containing the raw extracted data."""
@@ -0,0 +1,29 @@
1
import logging
from io import BytesIO

import requests

from .base import Extractor


class HTTPExtractor(Extractor):
    """Extractor that downloads a file over HTTP/HTTPS."""

    def __init__(self, settings: dict):
        super().__init__(settings)

    def extract(self, payload, loader=None):
        """Download the resource and return it as an in-memory file object.

        :param payload: payload dict used to resolve placeholder variables.
        :param loader: loader consulted for runtime variables (may be None
            when the URI template has no runtime placeholders).
        :returns: a seekable BytesIO positioned at the start of the data.
        :raises requests.RequestException: on connection failure, timeout,
            or non-2xx status.
        """
        url = self.resolve_placeholder_variables(payload, loader)
        logging.info("Requesting data from → %s", url)

        try:
            # stream=True so large files are read in chunks via iter_content
            # instead of buffered whole by requests; the timeout prevents a
            # hung orchestration job on an unresponsive server.
            response = requests.get(url, stream=True, timeout=60)
            # BUG FIX: without this, an HTTP error page (404/500) was
            # silently returned as if it were data.
            response.raise_for_status()
        except requests.RequestException as e:
            logging.error("Failed to fetch %s: %s", url, e)
            raise

        data = BytesIO()
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                data.write(chunk)
        data.seek(0)
        return data
@@ -0,0 +1,19 @@
1
import logging

from .base import Extractor


class LocalFileExtractor(Extractor):
    """Extractor that reads data from a file on the local filesystem."""

    def __init__(self, settings: dict):
        super().__init__(settings)

    def extract(self, payload=None, loader=None):
        """Open the source file and return a file-like object.

        ``payload`` and ``loader`` are accepted for signature parity with
        HTTPExtractor.extract so the ETL pipeline can call every extractor
        uniformly; when a payload is given, placeholder variables in the
        source URI are resolved the same way.

        :returns: an open text-mode file handle, or None on failure
            (best-effort behavior preserved from the original). The caller
            is responsible for closing the returned handle.
        """
        path = (
            self.source_uri
            if payload is None
            else self.resolve_placeholder_variables(payload, loader)
        )
        try:
            file_handle = open(path, "r")
        except OSError as e:
            # Narrowed from `except Exception`: only filesystem errors are
            # expected here; anything else should propagate.
            logging.error("Error opening file '%s': %s", path, e)
            return None
        logging.info("Successfully opened file '%s'.", path)
        return file_handle
@@ -9,5 +9,5 @@ class Loader(ABC):
9
9
  pass
10
10
 
11
11
  @abstractmethod
12
- def get_data_requirements(self, df: pd.DataFrame) -> Dict[str, pd.Timestamp]:
12
+ def earliest_begin_date(self, payload_mappings) -> str:
13
13
  pass
@@ -0,0 +1,70 @@
1
from hydroserverpy import HydroServer
from typing import Optional

from .base import Loader
import logging
import pandas as pd


class HydroServerLoader(HydroServer, Loader):
    """A HydroServer client extended with ETL-specific loading functionality."""

    def __init__(
        self,
        host: str,
        email: Optional[str] = None,
        password: Optional[str] = None,
        apikey: Optional[str] = None,
    ):
        super().__init__(
            host=host,
            email=email,
            password=password,
            apikey=apikey,
        )
        # Cache of payload name -> earliest begin timestamp.
        # BUG FIX: was annotated dict[str, str] although pd.Timestamp
        # values are stored (see earliest_begin_date).
        self._begin_cache: dict[str, pd.Timestamp] = {}

    def load(self, data: pd.DataFrame, payload) -> None:
        """Load observations from a DataFrame to the HydroServer.

        :param data: DataFrame with a "timestamp" column; every other column
            name is treated as a datastream UID whose values are observations.
        :param payload: payload dict with "name" and "mappings" entries.
        """
        begin_date = self.earliest_begin_date(payload)
        timestamps = data["timestamp"]
        # BUG FIX: begin_date is tz-aware UTC; comparing tz-naive timestamps
        # against it raises TypeError, so localize naive input first (the
        # 1.1.1 loader did this too).
        if timestamps.dt.tz is None:
            data = data.assign(timestamp=timestamps.dt.tz_localize("UTC"))
        new_data = data[data["timestamp"] > begin_date]
        for col in new_data.columns.difference(["timestamp"]):
            df = (
                new_data[["timestamp", col]]
                .rename(columns={col: "value"})
                .dropna(subset=["value"])
            )
            if df.empty:
                logging.warning("No new data for %s, skipping.", col)
                continue
            # Lazy %-style args so the DataFrame is only rendered when the
            # log level actually emits the record.
            logging.info("loading dataframe %s", df)
            logging.info("dtypes: %s", df.dtypes)

            df["value"] = pd.to_numeric(df["value"], errors="raise")
            self.datastreams.load_observations(uid=col, observations=df)

    def _fetch_earliest_begin(self, mappings: list[dict]) -> pd.Timestamp:
        """Return the earliest phenomenon end time across mapped datastreams.

        Datastreams with no observations yet fall back to the epoch so all
        of their incoming data is considered new.

        :raises ValueError: if mappings is empty (min() of an empty list).
        :raises RuntimeError: if a mapped datastream cannot be fetched.
        """
        if not mappings:
            # BUG FIX: min([]) below would raise a bare ValueError with an
            # unhelpful message.
            raise ValueError("Payload has no mappings; cannot compute begin date.")
        timestamps = []
        for m in mappings:
            ds = self.datastreams.get(uid=m["targetIdentifier"])
            if not ds:
                raise RuntimeError(f"Datastream {m['targetIdentifier']} not found.")
            raw = ds.phenomenon_end_time or "1970-01-01"
            ts = pd.to_datetime(raw, utc=True)
            logging.info("timestamp %s", ts)
            timestamps.append(ts)
        return min(timestamps)

    def earliest_begin_date(self, payload: dict) -> pd.Timestamp:
        """Return the earliest begin date for a payload, computing and
        caching it on first call (keyed by the payload's "name")."""
        key = payload["name"]
        if key not in self._begin_cache:
            self._begin_cache[key] = self._fetch_earliest_begin(payload["mappings"])
        return self._begin_cache[key]
@@ -0,0 +1,75 @@
1
import logging
from datetime import datetime, timedelta, timezone
from typing import Union

import pandas as pd


class TimestampParser:
    """Parses and formats timestamps according to a configured format.

    Supported formats:
      * "utc"      - timestamps parsed/rendered as UTC.
      * "iso8601"  - timestamps with an embedded offset, normalized to UTC.
      * "constant" - naive timestamps at a fixed offset ("+HHMM"/"-HHMM").
      * any strftime pattern containing '%' - naive, assumed UTC on parse.
    """

    def __init__(
        self, timestamp_format: str = "ISO8601", timestamp_offset: str = "+0000"
    ):
        VALID_KEYS = {"utc", "iso8601", "constant"}
        self.timestamp_offset = timestamp_offset
        self.timestamp_format = timestamp_format

        if (
            self.timestamp_format.lower() not in VALID_KEYS
            and "%" not in self.timestamp_format
        ):
            raise ValueError(
                f"timestamp_format must be one of {', '.join(VALID_KEYS)} "
                "or a valid strftime pattern."
            )

    def _constant_tz(self) -> timezone:
        """Build a fixed-offset timezone from '+HHMM'/'-HHMM'.

        Shared by parse_series and format so the offset is validated in
        both directions (previously format() indexed the offset unchecked).

        :raises ValueError: if timestamp_offset is not '+HHMM'/'-HHMM'.
        """
        off = self.timestamp_offset.strip()
        if len(off) != 5 or off[0] not in "+-" or not off[1:].isdigit():
            raise ValueError(f"Invalid timestamp_offset: {off}")
        sign = 1 if off[0] == "+" else -1
        hrs, mins = int(off[1:3]), int(off[3:5])
        return timezone(timedelta(minutes=sign * (hrs * 60 + mins)))

    def parse_series(self, raw_series: pd.Series) -> pd.Series:
        """Parse a string Series into tz-aware UTC datetimes.

        Unparseable values become NaT (logged with a small sample).
        """
        s = raw_series.str.strip()
        fmt = self.timestamp_format.lower()

        if fmt == "utc":
            parsed = pd.to_datetime(s, utc=True, errors="coerce")

        elif fmt == "iso8601":
            # BUG FIX: the previous `.dt.tz_convert("UTC")` raised TypeError
            # on tz-naive input and failed on mixed embedded offsets.
            # utc=True lets pandas convert each embedded offset to UTC
            # (naive values are assumed to already be UTC).
            parsed = pd.to_datetime(s, utc=True, errors="coerce")

        elif fmt == "constant":
            naive = pd.to_datetime(s, errors="coerce")
            parsed = naive.dt.tz_localize(self._constant_tz()).dt.tz_convert("UTC")

        else:
            # Custom strftime pattern; parsed values are naive, assumed UTC.
            parsed = pd.to_datetime(
                s, format=self.timestamp_format, errors="coerce"
            ).dt.tz_localize("UTC")

        if parsed.isna().any():
            bad_rows = s[parsed.isna()].head(5).tolist()
            logging.warning(
                "%d timestamps failed to parse. Sample bad values: %s",
                parsed.isna().sum(),
                bad_rows,
            )

        return parsed

    def format(self, dt: Union[datetime, pd.Timestamp]) -> str:
        """Render a datetime in the configured format as a string."""
        if isinstance(dt, pd.Timestamp):
            dt = dt.to_pydatetime()

        # NOTE(review): a tz-naive dt is interpreted by astimezone() as
        # *local* time; callers appear to pass tz-aware values — confirm.
        fmt = self.timestamp_format.lower()
        if fmt == "utc":
            return dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")

        if fmt == "iso8601":
            return dt.astimezone(timezone.utc).isoformat()

        if fmt == "constant":
            return dt.astimezone(self._constant_tz()).strftime("%Y-%m-%dT%H:%M:%S")

        # custom strftime
        return dt.strftime(self.timestamp_format)
@@ -1,8 +1,7 @@
1
1
  from abc import ABC, abstractmethod
2
- from datetime import timedelta, timezone
3
2
  import logging
4
3
  from typing import Union
5
- import pandas as pd
4
+ from src.hydroserverpy.etl.timestamp_parser import TimestampParser
6
5
 
7
6
 
8
7
  class Transformer(ABC):
@@ -17,6 +16,10 @@ class Transformer(ABC):
17
16
  # Users will always interact in 1-based, so if the key is a column index, convert to 0-based
18
17
  self.timestamp_key = self.timestamp_key - 1
19
18
 
19
+ self.timestamp_parser = TimestampParser(
20
+ self.timestamp_format, self.timestamp_offset
21
+ )
22
+
20
23
  @abstractmethod
21
24
  def transform(self, *args, **kwargs) -> None:
22
25
  pass
@@ -55,7 +58,7 @@ class Transformer(ABC):
55
58
  to_keep = ["timestamp", *expected]
56
59
  df.drop(columns=df.columns.difference(to_keep), inplace=True)
57
60
 
58
- df["timestamp"] = self._parse_timestamps(df["timestamp"])
61
+ df["timestamp"] = self.timestamp_parser.parse_series(df["timestamp"])
59
62
 
60
63
  df.drop_duplicates(subset=["timestamp"], keep="last")
61
64
  logging.info(f"standardized dataframe created: {df.shape}")
@@ -63,55 +66,3 @@ class Transformer(ABC):
63
66
  logging.info(f"{df.head()}")
64
67
 
65
68
  return df
66
-
67
- def _parse_timestamps(self, raw_series: pd.Series) -> pd.Series:
68
- """Return a Series of pandas UTC datetimes for the four supported modes."""
69
- logging.info(f"parsing timestamps. Format: {self.timestamp_format}")
70
-
71
- fmt = self.timestamp_format.lower()
72
-
73
- VALID_KEYS = {"utc", "iso8601", "constant"}
74
- if fmt not in VALID_KEYS and "%" not in self.timestamp_format:
75
- raise ValueError(
76
- f"timestamp_format must be one of {', '.join(VALID_KEYS)} "
77
- "or a valid strftime pattern."
78
- )
79
-
80
- series = raw_series.str.strip()
81
-
82
- if fmt == "utc":
83
- # Accept Z-suffix, no offset, fractional seconds, etc.
84
- parsed = pd.to_datetime(series, utc=True, errors="coerce")
85
-
86
- elif fmt == "iso8601":
87
- # pandas reads the embedded offset, then we shift to UTC
88
- parsed = pd.to_datetime(series, errors="coerce").dt.tz_convert("UTC")
89
-
90
- elif fmt == "constant":
91
- offset = str(self.timestamp_offset).strip()
92
- if not (len(offset) == 5 and offset[0] in "+-"):
93
- raise ValueError(f"Invalid timestampOffset: {self.timestamp_offset}")
94
-
95
- sign_multiplier = 1 if offset[0] == "+" else -1
96
- hours = int(offset[1:3])
97
- minutes = int(offset[3:5])
98
- total_minutes = sign_multiplier * (hours * 60 + minutes)
99
- local_timezone = timezone(timedelta(minutes=total_minutes))
100
-
101
- naive_times = pd.to_datetime(series, errors="coerce")
102
- localized_times = naive_times.dt.tz_localize(local_timezone)
103
- parsed = localized_times.dt.tz_convert("UTC")
104
-
105
- else:
106
- logging.info(f"timestamp format is custom {self.timestamp_format}")
107
- parsed = pd.to_datetime(
108
- series, format=self.timestamp_format, errors="coerce"
109
- ).dt.tz_localize("UTC")
110
-
111
- if parsed.isna().any():
112
- bad_rows = series[parsed.isna()].head(5).tolist()
113
- logging.warning(
114
- f"{parsed.isna().sum()} timestamps failed to parse. Sample bad values: {bad_rows}"
115
- )
116
-
117
- return parsed
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hydroserverpy
3
- Version: 1.1.1
3
+ Version: 1.2.0
4
4
  Requires-Python: <4,>=3.9
5
5
  License-File: LICENSE
6
6
  Requires-Dist: requests>=2
@@ -52,6 +52,7 @@ src/hydroserverpy/api/services/sta/thing.py
52
52
  src/hydroserverpy/api/services/sta/unit.py
53
53
  src/hydroserverpy/etl/__init__.py
54
54
  src/hydroserverpy/etl/hydroserver_etl.py
55
+ src/hydroserverpy/etl/timestamp_parser.py
55
56
  src/hydroserverpy/etl/types.py
56
57
  src/hydroserverpy/etl/extractors/__init__.py
57
58
  src/hydroserverpy/etl/extractors/base.py
@@ -1,13 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from typing import Dict
3
- from ..types import TimeRange
4
-
5
-
6
- class Extractor(ABC):
7
- @abstractmethod
8
- def prepare_params(self, data_requirements: Dict[str, TimeRange]):
9
- pass
10
-
11
- @abstractmethod
12
- def extract(self):
13
- pass
@@ -1,99 +0,0 @@
1
- import logging
2
- from hydroserverpy.etl.types import TimeRange
3
- import requests
4
- from io import BytesIO
5
- from typing import Dict
6
- from .base import Extractor
7
-
8
-
9
- class HTTPExtractor(Extractor):
10
- def __init__(self, settings: object):
11
- self.url = settings["urlTemplate"]
12
- # self.url = self.format_url(url, url_variables or {})
13
- # self.params = settings.get('params', )
14
- # self.headers = headers
15
- # self.auth = auth
16
-
17
- def prepare_params(self, data_requirements: Dict[str, TimeRange]):
18
- pass
19
- # TODO: Uncomment this once url templates work on in the Data Management App
20
- # start_times = [
21
- # req["start_time"] for req in data_requirements.values() if req["start_time"]
22
- # ]
23
-
24
- # if start_times:
25
- # oldest_start_time = min(start_times)
26
- # start_time_key = self.params.pop("start_time_key", None)
27
- # if start_time_key:
28
- # self.params[start_time_key] = oldest_start_time
29
- # logging.info(
30
- # f"Set start_time to {oldest_start_time} and removed 'start_time_key'"
31
- # )
32
- # else:
33
- # logging.warning("'start_time_key' not found in params.")
34
-
35
- # end_times = [
36
- # req["end_time"] for req in data_requirements.values() if req["end_time"]
37
- # ]
38
-
39
- # if end_times:
40
- # newest_end_time = max(end_times)
41
- # end_time_key = self.params.pop("end_time_key", None)
42
- # if end_time_key:
43
- # self.params[end_time_key] = newest_end_time
44
- # logging.info(
45
- # f"Set end_time to {newest_end_time} and removed 'end_time_key'"
46
- # )
47
- # else:
48
- # logging.warning("'end_time_key' not found in params.")
49
-
50
- def extract(self):
51
- """
52
- Downloads the file from the HTTP/HTTPS server and returns a file-like object.
53
- """
54
-
55
- logging.info(f"Requesting data from → {self.url}")
56
-
57
- # endpoints = [
58
- # "https://httpbin.org/get",
59
- # "https://jsonplaceholder.typicode.com/posts/1",
60
- # "https://api.github.com",
61
- # "https://api.ipify.org?format=json",
62
- # "https://www.python.org/",
63
- # "https://waterservices.usgs.gov/nwis/iv/?&format=json&sites=01646500&parameterCd=00060",
64
- # "https://datahub.io/core/country-list/r/data.csv",
65
- # "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv",
66
- # # "https://rain-flow.slco.org/export/file/?delimiter=comma&site_id=68&data_start=2025-04-09&data_end=2025-05-09&device_id=2",
67
- # # "https://rain-flow.slco.org/export/file/?mime=txt&delimiter=comma&site_id=68&data_start=2025-05-09%2000:00:00&data_end=2025-05-09%2023:59:59&device_id=2"
68
- # ]
69
- # for url in endpoints:
70
- # try:
71
- # r = requests.get(url, timeout=10)
72
- # print(f"{url:50} → {r.status_code}")
73
- # except Exception as e:
74
- # print(f"{url:50} → ERROR: {e}")
75
-
76
- try:
77
- response = requests.get(self.url)
78
- except Exception as e:
79
- logging.error(f"Failed to fetch {repr(self.url)}: {e}")
80
- raise
81
-
82
- logging.info(f"Received response")
83
-
84
- data = BytesIO()
85
- for chunk in response.iter_content(chunk_size=8192):
86
- if chunk:
87
- data.write(chunk)
88
- data.seek(0)
89
- return data
90
-
91
- @staticmethod
92
- def format_url(url_template, url_variables):
93
- try:
94
- url = url_template.format(**url_variables)
95
- except KeyError as e:
96
- missing_key = e.args[0]
97
- raise KeyError(f"Missing configuration url_variable: {missing_key}")
98
-
99
- return url
@@ -1,29 +0,0 @@
1
- import logging
2
- from typing import Dict
3
-
4
- from .base import Extractor
5
- from ..types import TimeRange
6
-
7
-
8
- class LocalFileExtractor(Extractor):
9
- def __init__(self, settings: object):
10
- if "path" not in settings:
11
- message = "Missing required setting 'path' in LocalFileExtractor settings."
12
- logging.error(message)
13
- raise ValueError(message)
14
- self.path = settings["path"]
15
-
16
- def prepare_params(self, data_requirements: Dict[str, TimeRange]):
17
- pass
18
-
19
- def extract(self):
20
- """
21
- Opens the file and returns a file-like object.
22
- """
23
- try:
24
- file_handle = open(self.path, "r")
25
- logging.info(f"Successfully opened file '{self.path}'.")
26
- return file_handle
27
- except Exception as e:
28
- logging.error(f"Error opening file '{self.path}': {e}")
29
- return None
@@ -1,91 +0,0 @@
1
- import datetime
2
- from hydroserverpy import HydroServer
3
- from typing import Dict, Optional
4
-
5
- from hydroserverpy.etl.types import TimeRange
6
- from .base import Loader
7
- import logging
8
- import pandas as pd
9
-
10
-
11
- class HydroServerLoader(HydroServer, Loader):
12
- """
13
- A class that extends the HydroServer client with ETL-specific functionalities.
14
- """
15
-
16
- def __init__(
17
- self,
18
- host: str,
19
- email: Optional[str] = None,
20
- password: Optional[str] = None,
21
- apikey: Optional[str] = None,
22
- ):
23
- super().__init__(
24
- host=host,
25
- email=email,
26
- password=password,
27
- apikey=apikey,
28
- )
29
-
30
- def load(self, data: pd.DataFrame, payload_settings) -> None:
31
- """
32
- Load observations from a DataFrame to the HydroServer.
33
-
34
- :param data: A Pandas DataFrame where each column corresponds to a datastream.
35
- """
36
- mappings = payload_settings["mappings"]
37
- time_ranges = self.get_data_requirements(mappings)
38
- for ds_id in data.columns:
39
- if ds_id == "timestamp":
40
- continue
41
-
42
- df = data[["timestamp", ds_id]].copy()
43
- df.rename(columns={ds_id: "value"}, inplace=True)
44
- df.dropna(subset=["value"], inplace=True)
45
-
46
- # ensure the timestamp column is UTC‑aware
47
- timestamp_column = df["timestamp"]
48
- if timestamp_column.dt.tz is None:
49
- df["timestamp"] = timestamp_column.dt.tz_localize("UTC")
50
-
51
- time_range = time_ranges[ds_id]
52
- start_ts = pd.to_datetime(time_range["start_time"], utc=True)
53
-
54
- if start_ts:
55
- df = df[df["timestamp"] > start_ts]
56
- logging.info(f"start cutoff for data loading {start_ts}")
57
- if df.empty:
58
- logging.warning(
59
- f"No new data to upload for datastream {ds_id}. Skipping."
60
- )
61
- continue
62
- self.datastreams.load_observations(uid=ds_id, observations=df)
63
-
64
- def get_data_requirements(self, source_target_map) -> Dict[str, TimeRange]:
65
- """
66
- Each target system needs to be able to answer the question: 'What data do you need?'
67
- and return a time range for each target time series. Usually the answer will be
68
- 'anything newer than my most recent observation'.
69
- """
70
- data_requirements = {}
71
- target_ids = [mapping["targetIdentifier"] for mapping in source_target_map]
72
- for id in target_ids:
73
- datastream = self.datastreams.get(uid=id)
74
- if not datastream:
75
- message = "Couldn't fetch target datastream. ETL process aborted."
76
- logging.error(message)
77
- raise message
78
-
79
- start_ts = pd.Timestamp(
80
- datastream.phenomenon_end_time or "1970-01-01T00:00:00Z"
81
- )
82
- if start_ts.tzinfo is None:
83
- start_ts = start_ts.tz_localize("UTC")
84
-
85
- end_ts = pd.Timestamp.now(tz="UTC")
86
-
87
- data_requirements[id] = {
88
- "start_time": start_ts.isoformat(),
89
- "end_time": end_ts.isoformat(),
90
- }
91
- return data_requirements
File without changes
File without changes
File without changes