hydroserverpy 1.1.2__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hydroserverpy has been flagged as possibly problematic.
- hydroserverpy/api/models/etl/data_source.py +2 -2
- hydroserverpy/etl/extractors/base.py +51 -7
- hydroserverpy/etl/extractors/http_extractor.py +7 -77
- hydroserverpy/etl/extractors/local_file_extractor.py +4 -14
- hydroserverpy/etl/loaders/base.py +1 -1
- hydroserverpy/etl/loaders/hydroserver_loader.py +34 -55
- hydroserverpy/etl/timestamp_parser.py +75 -0
- hydroserverpy/etl/transformers/base.py +6 -55
- {hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/METADATA +1 -1
- {hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/RECORD +14 -13
- {hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/WHEEL +0 -0
- {hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/top_level.txt +0 -0
- {hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/zip-safe +0 -0
hydroserverpy/api/models/etl/data_source.py

@@ -129,7 +129,7 @@ class DataSource(HydroServerModel, DataSourceFields, OrchestrationConfigurationF
             return
 
         if self.settings["extractor"]["type"] == "local":
-            with open(self.settings["extractor"]["
+            with open(self.settings["extractor"]["sourceUri"]) as data_file:
                 loader = HydroServerETLCSV(
                     self._connection, data_file=data_file, data_source=self
                 )
@@ -137,7 +137,7 @@ class DataSource(HydroServerModel, DataSourceFields, OrchestrationConfigurationF
         elif self.settings["extractor"]["type"] == "HTTP":
             with tempfile.NamedTemporaryFile(mode="w+") as temp_file:
                 response = requests.get(
-                    self.settings["extractor"]["
+                    self.settings["extractor"]["sourceUri"],
                     stream=True,
                     timeout=60,
                 )
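Both hunks swap the old extractor key for sourceUri. A minimal sketch of the settings shape these lines now read (only the keys actually used here are shown; the URL is a hypothetical stand-in):

    settings = {
        "extractor": {
            "type": "HTTP",  # or "local"
            "sourceUri": "https://example.com/export.csv",  # hypothetical URL
        }
    }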
hydroserverpy/etl/extractors/base.py

@@ -1,12 +1,56 @@
-from abc import 
-
-
+from abc import abstractmethod
+import logging
+import pandas as pd
+from datetime import datetime
 
+from src.hydroserverpy.etl.timestamp_parser import TimestampParser
 
-
-
-def 
-
+
+class Extractor:
+    def __init__(self, settings: dict):
+        self.settings = settings
+        self.source_uri = settings["sourceUri"]
+
+    def resolve_placeholder_variables(self, payload, loader):
+        logging.info(f"Creating runtime variables...")
+        filled = {}
+        for var in self.settings.get("placeholderVariables", []):
+            name = var["name"]
+            var_type = var.get("type", None)
+
+            if var_type == "runTime":
+                logging.info(f"Resolving runtime var: {name}")
+                if var.get("runTimeValue", None) == "latestObservationTimestamp":
+                    value = loader.earliest_begin_date(payload)
+                elif var.get("runTimeValue", None) == "jobExecutionTime":
+                    value = pd.Timestamp.now(tz="UTC")
+            elif var_type == "perPayload":
+                logging.info(f"Resolving payload var: {name}")
+                payload_vars = payload.get("extractorVariables", {})
+                if name not in payload_vars:
+                    raise KeyError(f"Missing per-payload variable '{name}'")
+                value = payload_vars[name]
+            else:
+                continue
+
+            if isinstance(value, (datetime, pd.Timestamp)):
+                fmt = var.get("timestampFormat", "ISO8601")
+                offset = var.get("timestampOffset", "+0000")
+                parser = TimestampParser(fmt, offset)
+                value = parser.format(value)
+
+            filled[name] = value
+        if not filled:
+            return self.source_uri
+        return self.format_uri(filled)
+
+    def format_uri(self, placeholder_variables):
+        try:
+            uri = self.source_uri.format(**placeholder_variables)
+        except KeyError as e:
+            missing_key = e.args[0]
+            raise KeyError(f"Missing placeholder variable: {missing_key}")
+        return uri
 
     @abstractmethod
     def extract(self):
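resolve_placeholder_variables ultimately feeds format_uri, which is plain str.format over the sourceUri template. A self-contained sketch of that mechanism (all names and values here are hypothetical, not from the package):

    template = "https://example.com/api?site={siteCode}&start={startTime}"

    # A "perPayload" variable would come from payload["extractorVariables"];
    # a "runTime" variable would come from the loader (e.g. the latest
    # observation timestamp), formatted through TimestampParser.
    variables = {"siteCode": "01646500", "startTime": "2025-05-09T00:00:00"}

    try:
        url = template.format(**variables)
    except KeyError as e:
        raise KeyError(f"Missing placeholder variable: {e.args[0]}")

    print(url)
    # https://example.com/api?site=01646500&start=2025-05-09T00:00:00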
hydroserverpy/etl/extractors/http_extractor.py

@@ -1,99 +1,29 @@
 import logging
-from hydroserverpy.etl.types import TimeRange
 import requests
 from io import BytesIO
-from typing import Dict
 from .base import Extractor
 
 
 class HTTPExtractor(Extractor):
-    def __init__(self, settings: 
-
-        # self.url = self.format_url(url, url_variables or {})
-        # self.params = settings.get('params', )
-        # self.headers = headers
-        # self.auth = auth
+    def __init__(self, settings: dict):
+        super().__init__(settings)
 
-    def 
-        pass
-        # TODO: Uncomment this once url templates work on in the Data Management App
-        # start_times = [
-        #     req["start_time"] for req in data_requirements.values() if req["start_time"]
-        # ]
-
-        # if start_times:
-        #     oldest_start_time = min(start_times)
-        #     start_time_key = self.params.pop("start_time_key", None)
-        #     if start_time_key:
-        #         self.params[start_time_key] = oldest_start_time
-        #         logging.info(
-        #             f"Set start_time to {oldest_start_time} and removed 'start_time_key'"
-        #         )
-        #     else:
-        #         logging.warning("'start_time_key' not found in params.")
-
-        # end_times = [
-        #     req["end_time"] for req in data_requirements.values() if req["end_time"]
-        # ]
-
-        # if end_times:
-        #     newest_end_time = max(end_times)
-        #     end_time_key = self.params.pop("end_time_key", None)
-        #     if end_time_key:
-        #         self.params[end_time_key] = newest_end_time
-        #         logging.info(
-        #             f"Set end_time to {newest_end_time} and removed 'end_time_key'"
-        #         )
-        #     else:
-        #         logging.warning("'end_time_key' not found in params.")
-
-    def extract(self):
+    def extract(self, payload, loader=None):
         """
         Downloads the file from the HTTP/HTTPS server and returns a file-like object.
         """
-
-        logging.info(f"Requesting data from → {
-
-        # endpoints = [
-        #     "https://httpbin.org/get",
-        #     "https://jsonplaceholder.typicode.com/posts/1",
-        #     "https://api.github.com",
-        #     "https://api.ipify.org?format=json",
-        #     "https://www.python.org/",
-        #     "https://waterservices.usgs.gov/nwis/iv/?&format=json&sites=01646500&parameterCd=00060",
-        #     "https://datahub.io/core/country-list/r/data.csv",
-        #     "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv",
-        #     # "https://rain-flow.slco.org/export/file/?delimiter=comma&site_id=68&data_start=2025-04-09&data_end=2025-05-09&device_id=2",
-        #     # "https://rain-flow.slco.org/export/file/?mime=txt&delimiter=comma&site_id=68&data_start=2025-05-09%2000:00:00&data_end=2025-05-09%2023:59:59&device_id=2"
-        # ]
-        # for url in endpoints:
-        #     try:
-        #         r = requests.get(url, timeout=10)
-        #         print(f"{url:50} → {r.status_code}")
-        #     except Exception as e:
-        #         print(f"{url:50} → ERROR: {e}")
+        url = self.resolve_placeholder_variables(payload, loader)
+        logging.info(f"Requesting data from → {url}")
 
         try:
-            response = requests.get(
+            response = requests.get(url)
         except Exception as e:
-            logging.error(f"Failed to fetch {
+            logging.error(f"Failed to fetch {url}: {e}")
             raise
 
-        logging.info(f"Received response")
-
         data = BytesIO()
         for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                data.write(chunk)
         data.seek(0)
         return data
-
-    @staticmethod
-    def format_url(url_template, url_variables):
-        try:
-            url = url_template.format(**url_variables)
-        except KeyError as e:
-            missing_key = e.args[0]
-            raise KeyError(f"Missing configuration url_variable: {missing_key}")
-
-        return url
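A hypothetical invocation of the new extract(payload, loader) signature, assuming the wheel's imports resolve; the URL and variable names are stand-ins:

    from hydroserverpy.etl.extractors.http_extractor import HTTPExtractor

    settings = {
        "sourceUri": "https://example.com/data.csv?site={siteCode}",  # stand-in URL
        "placeholderVariables": [{"name": "siteCode", "type": "perPayload"}],
    }
    payload = {"name": "example-job", "extractorVariables": {"siteCode": "68"}}

    extractor = HTTPExtractor(settings)
    data = extractor.extract(payload)  # BytesIO of the response body
    print(data.read(80))

No loader is needed here because no placeholder is of type "runTime"; a runTime variable would require passing the loader so the extractor can query the latest observation timestamp.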
hydroserverpy/etl/extractors/local_file_extractor.py

@@ -1,29 +1,19 @@
 import logging
-from typing import Dict
-
 from .base import Extractor
-from ..types import TimeRange
 
 
 class LocalFileExtractor(Extractor):
     def __init__(self, settings: object):
-
-        message = "Missing required setting 'path' in LocalFileExtractor settings."
-        logging.error(message)
-        raise ValueError(message)
-        self.path = settings["path"]
-
-    def prepare_params(self, data_requirements: Dict[str, TimeRange]):
-        pass
+        super().__init__(settings)
 
     def extract(self):
         """
         Opens the file and returns a file-like object.
         """
         try:
-            file_handle = open(self.
-            logging.info(f"Successfully opened file '{self.
+            file_handle = open(self.source_uri, "r")
+            logging.info(f"Successfully opened file '{self.source_uri}'.")
             return file_handle
         except Exception as e:
-            logging.error(f"Error opening file '{self.
+            logging.error(f"Error opening file '{self.source_uri}': {e}")
             return None
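The local extractor now reads the same sourceUri key instead of path. A hypothetical call, assuming the file exists:

    from hydroserverpy.etl.extractors.local_file_extractor import LocalFileExtractor

    extractor = LocalFileExtractor({"sourceUri": "/tmp/observations.csv"})  # stand-in path
    fh = extractor.extract()
    if fh is not None:  # extract() returns None on failure rather than raising
        print(fh.readline())
        fh.close()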
hydroserverpy/etl/loaders/hydroserver_loader.py

@@ -1,8 +1,6 @@
-import datetime
 from hydroserverpy import HydroServer
-from typing import 
+from typing import Optional
 
-from hydroserverpy.etl.types import TimeRange
 from .base import Loader
 import logging
 import pandas as pd
@@ -26,66 +24,47 @@ class HydroServerLoader(HydroServer, Loader):
             password=password,
             apikey=apikey,
         )
+        self._begin_cache: dict[str, str] = {}
 
-    def load(self, data: pd.DataFrame, 
+    def load(self, data: pd.DataFrame, payload) -> None:
         """
         Load observations from a DataFrame to the HydroServer.
-
         :param data: A Pandas DataFrame where each column corresponds to a datastream.
         """
-
-
-        for 
-
+        begin_date = self.earliest_begin_date(payload)
+        new_data = data[data["timestamp"] > begin_date]
+        for col in new_data.columns.difference(["timestamp"]):
+            df = (
+                new_data[["timestamp", col]]
+                .rename(columns={col: "value"})
+                .dropna(subset=["value"])
+            )
+            if df.empty:
+                logging.warning(f"No new data for {col}, skipping.")
                 continue
+            logging.info(f"loading dataframe {df}")
+            logging.info(f"dtypes: {df.dtypes}")
 
-            df = 
-
-            df.dropna(subset=["value"], inplace=True)
+            df["value"] = pd.to_numeric(df["value"], errors="raise")
+            self.datastreams.load_observations(uid=col, observations=df)
 
-
-
-
-
+    def _fetch_earliest_begin(self, mappings: list[dict]) -> pd.Timestamp:
+        timestamps = []
+        for m in mappings:
+            ds = self.datastreams.get(uid=m["targetIdentifier"])
+            if not ds:
+                raise RuntimeError(f"Datastream {m['targetIdentifier']} not found.")
+            raw = ds.phenomenon_end_time or "1970-01-01"
+            ts = pd.to_datetime(raw, utc=True)
+            logging.info(f"timestamp {ts}")
+            timestamps.append(ts)
+        return min(timestamps)
 
-
-            start_ts = pd.to_datetime(time_range["start_time"], utc=True)
-
-            if start_ts:
-                df = df[df["timestamp"] > start_ts]
-                logging.info(f"start cutoff for data loading {start_ts}")
-                if df.empty:
-                    logging.warning(
-                        f"No new data to upload for datastream {ds_id}. Skipping."
-                    )
-                    continue
-                self.datastreams.load_observations(uid=ds_id, observations=df)
-
-    def get_data_requirements(self, source_target_map) -> Dict[str, TimeRange]:
+    def earliest_begin_date(self, payload: dict) -> pd.Timestamp:
         """
-
-        and return a time range for each target time series. Usually the answer will be
-        'anything newer than my most recent observation'.
+        Return earliest begin date for a payload, or compute+cache it on first call.
         """
-
-
-
-
-        if not datastream:
-            message = "Couldn't fetch target datastream. ETL process aborted."
-            logging.error(message)
-            raise message
-
-        start_ts = pd.Timestamp(
-            datastream.phenomenon_end_time or "1970-01-01T00:00:00Z"
-        )
-        if start_ts.tzinfo is None:
-            start_ts = start_ts.tz_localize("UTC")
-
-        end_ts = pd.Timestamp.now(tz="UTC")
-
-        data_requirements[id] = {
-            "start_time": start_ts.isoformat(),
-            "end_time": end_ts.isoformat(),
-        }
-        return data_requirements
+        key = payload["name"]
+        if key not in self._begin_cache:
+            self._begin_cache[key] = self._fetch_earliest_begin(payload["mappings"])
+        return self._begin_cache[key]
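load() now filters against the earliest phenomenon_end_time across the payload's target datastreams, cached per payload name. A runnable sketch of just that cutoff rule, with made-up timestamps and a hypothetical datastream column:

    import pandas as pd

    # One datastream ends 2025-05-01; the other has no data yet, so it falls
    # back to 1970-01-01 and wins min(): every row newer than that is kept.
    end_times = ["2025-05-01T00:00:00Z", None]
    begin_date = min(pd.to_datetime(t or "1970-01-01", utc=True) for t in end_times)

    data = pd.DataFrame({
        "timestamp": pd.to_datetime(["1969-12-31", "2025-05-02"], utc=True),
        "ds-uid": [1.0, 2.0],  # hypothetical datastream column
    })
    print(data[data["timestamp"] > begin_date])  # only the 2025-05-02 row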
hydroserverpy/etl/timestamp_parser.py

@@ -0,0 +1,75 @@
+import logging
+from datetime import datetime, timedelta, timezone
+from typing import Union
+import pandas as pd
+
+
+class TimestampParser:
+    def __init__(
+        self, timestamp_format: str = "ISO8601", timestamp_offset: str = "+0000"
+    ):
+        VALID_KEYS = {"utc", "iso8601", "constant"}
+        self.timestamp_offset = timestamp_offset
+        self.timestamp_format = timestamp_format
+
+        if (
+            self.timestamp_format.lower() not in VALID_KEYS
+            and "%" not in self.timestamp_format
+        ):
+            raise ValueError(
+                f"timestamp_format must be one of {', '.join(VALID_KEYS)} "
+                "or a valid strftime pattern."
+            )
+
+    def parse_series(self, raw_series: pd.Series) -> pd.Series:
+        s = raw_series.str.strip()
+        if self.timestamp_format.lower() == "utc":
+            parsed = pd.to_datetime(s, utc=True, errors="coerce")
+
+        elif self.timestamp_format.lower() == "iso8601":
+            parsed = pd.to_datetime(s, errors="coerce").dt.tz_convert("UTC")
+
+        elif self.timestamp_format.lower() == "constant":
+            off = self.timestamp_offset.strip()
+            if not (len(off) == 5 and off[0] in "+-"):
+                raise ValueError(f"Invalid timestamp_offset: {off}")
+            sign = 1 if off[0] == "+" else -1
+            hrs, mins = int(off[1:3]), int(off[3:5])
+            tz = timezone(timedelta(minutes=sign * (hrs * 60 + mins)))
+            naive = pd.to_datetime(s, errors="coerce")
+            parsed = naive.dt.tz_localize(tz).dt.tz_convert("UTC")
+
+        else:
+            parsed = pd.to_datetime(
+                s, format=self.timestamp_format, errors="coerce"
+            ).dt.tz_localize("UTC")
+
+        if parsed.isna().any():
+            bad_rows = s[parsed.isna()].head(5).tolist()
+            logging.warning(
+                f"{parsed.isna().sum()} timestamps failed to parse. "
+                f"Sample bad values: {bad_rows}"
+            )
+
+        return parsed
+
+    def format(self, dt: Union[datetime, pd.Timestamp]) -> str:
+        if isinstance(dt, pd.Timestamp):
+            dt = dt.to_pydatetime()
+
+        fmt = self.timestamp_format.lower()
+        if fmt == "utc":
+            return dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")
+
+        if fmt == "iso8601":
+            return dt.astimezone(timezone.utc).isoformat()
+
+        if fmt == "constant":
+            off = self.timestamp_offset.strip()
+            sign = 1 if off[0] == "+" else -1
+            hrs, mins = int(off[1:3]), int(off[3:5])
+            tz = timezone(timedelta(minutes=sign * (hrs * 60 + mins)))
+            return dt.astimezone(tz).strftime("%Y-%m-%dT%H:%M:%S")
+
+        # custom strftime
+        return dt.strftime(self.timestamp_format)
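A usage sketch for the new module, assuming the 1.2.0 wheel is installed and importable; all sample values are hypothetical:

    import pandas as pd
    from hydroserverpy.etl.timestamp_parser import TimestampParser

    # "constant": naive local times shifted to UTC by a fixed offset.
    parser = TimestampParser(timestamp_format="constant", timestamp_offset="-0700")
    print(parser.parse_series(pd.Series(["2025-05-09 08:00:00"])))
    # 0   2025-05-09 15:00:00+00:00

    # Custom strftime pattern: parsed naively, then labelled as UTC.
    parser = TimestampParser(timestamp_format="%d/%m/%Y %H:%M")
    print(parser.parse_series(pd.Series(["09/05/2025 08:00"])))

    # format() is the inverse direction, used when rendering placeholder variables.
    ts = pd.Timestamp("2025-05-09T15:00:00Z")
    print(TimestampParser("constant", "-0700").format(ts))  # 2025-05-09T08:00:00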
hydroserverpy/etl/transformers/base.py

@@ -1,8 +1,7 @@
 from abc import ABC, abstractmethod
-from datetime import timedelta, timezone
 import logging
 from typing import Union
-
+from src.hydroserverpy.etl.timestamp_parser import TimestampParser
 
 
 class Transformer(ABC):
@@ -17,6 +16,10 @@ class Transformer(ABC):
             # Users will always interact in 1-based, so if the key is a column index, convert to 0-based
             self.timestamp_key = self.timestamp_key - 1
 
+        self.timestamp_parser = TimestampParser(
+            self.timestamp_format, self.timestamp_offset
+        )
+
     @abstractmethod
     def transform(self, *args, **kwargs) -> None:
         pass
@@ -55,7 +58,7 @@ class Transformer(ABC):
         to_keep = ["timestamp", *expected]
         df.drop(columns=df.columns.difference(to_keep), inplace=True)
 
-        df["timestamp"] = self.
+        df["timestamp"] = self.timestamp_parser.parse_series(df["timestamp"])
 
         df.drop_duplicates(subset=["timestamp"], keep="last")
         logging.info(f"standardized dataframe created: {df.shape}")
@@ -63,55 +66,3 @@ class Transformer(ABC):
         logging.info(f"{df.head()}")
 
         return df
-
-    def _parse_timestamps(self, raw_series: pd.Series) -> pd.Series:
-        """Return a Series of pandas UTC datetimes for the four supported modes."""
-        logging.info(f"parsing timestamps. Format: {self.timestamp_format}")
-
-        fmt = self.timestamp_format.lower()
-
-        VALID_KEYS = {"utc", "iso8601", "constant"}
-        if fmt not in VALID_KEYS and "%" not in self.timestamp_format:
-            raise ValueError(
-                f"timestamp_format must be one of {', '.join(VALID_KEYS)} "
-                "or a valid strftime pattern."
-            )
-
-        series = raw_series.str.strip()
-
-        if fmt == "utc":
-            # Accept Z-suffix, no offset, fractional seconds, etc.
-            parsed = pd.to_datetime(series, utc=True, errors="coerce")
-
-        elif fmt == "iso8601":
-            # pandas reads the embedded offset, then we shift to UTC
-            parsed = pd.to_datetime(series, errors="coerce").dt.tz_convert("UTC")
-
-        elif fmt == "constant":
-            offset = str(self.timestamp_offset).strip()
-            if not (len(offset) == 5 and offset[0] in "+-"):
-                raise ValueError(f"Invalid timestampOffset: {self.timestamp_offset}")
-
-            sign_multiplier = 1 if offset[0] == "+" else -1
-            hours = int(offset[1:3])
-            minutes = int(offset[3:5])
-            total_minutes = sign_multiplier * (hours * 60 + minutes)
-            local_timezone = timezone(timedelta(minutes=total_minutes))
-
-            naive_times = pd.to_datetime(series, errors="coerce")
-            localized_times = naive_times.dt.tz_localize(local_timezone)
-            parsed = localized_times.dt.tz_convert("UTC")
-
-        else:
-            logging.info(f"timestamp format is custom {self.timestamp_format}")
-            parsed = pd.to_datetime(
-                series, format=self.timestamp_format, errors="coerce"
-            ).dt.tz_localize("UTC")
-
-        if parsed.isna().any():
-            bad_rows = series[parsed.isna()].head(5).tolist()
-            logging.warning(
-                f"{parsed.isna().sum()} timestamps failed to parse. Sample bad values: {bad_rows}"
-            )
-
-        return parsed
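Timestamp parsing now lives in TimestampParser; the removed _parse_timestamps above is essentially the same logic. One detail worth noting in the surviving code: drop_duplicates is called without assigning its result, so as written it does not modify df. A short sketch of the effective versus intended behavior:

    import pandas as pd

    df = pd.DataFrame({
        "timestamp": pd.to_datetime(["2025-05-09", "2025-05-09"], utc=True),
        "value": [1.0, 2.0],
    })

    df.drop_duplicates(subset=["timestamp"], keep="last")       # result discarded
    df = df.drop_duplicates(subset=["timestamp"], keep="last")  # actually dedupes
    print(df)  # one row, value 2.0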
{hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/RECORD

@@ -6,7 +6,7 @@ hydroserverpy/api/models/__init__.py,sha256=buOhJ2Bf9yI0GftSyulpR74A5IhyyKJrXY1x
 hydroserverpy/api/models/base.py,sha256=dc2tfMSgizymxAAOVURfy7Jzeh6xIiiq7hfWZI7l1_Q,2280
 hydroserverpy/api/models/etl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hydroserverpy/api/models/etl/data_archive.py,sha256=u-gpvUsaWaw0kyF3bPMm2e55Jx2yhvSV9ufXXaNtrTc,3429
-hydroserverpy/api/models/etl/data_source.py,sha256=
+hydroserverpy/api/models/etl/data_source.py,sha256=th0DzyuFA5JM4nq_DtkI8GsPHMyy9zr3t6t5QWJXIao,5541
 hydroserverpy/api/models/etl/orchestration_configuration.py,sha256=ElSrgi7ioFZJFJg6aGogW5ZZk7fA17y4p--yWwiOhZ0,1367
 hydroserverpy/api/models/etl/orchestration_system.py,sha256=25En2G0z1gQzN-RW3UlrEGgkC952QDW21oYnawCX8hY,2357
 hydroserverpy/api/models/iam/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -41,17 +41,18 @@ hydroserverpy/api/services/sta/thing.py,sha256=QL7IBwHHIgDFBpXnQF-LOUpxiRlm_HFWB
 hydroserverpy/api/services/sta/unit.py,sha256=ksO-3g___9pPNBNbgM0jyDf1NeBqX79fjeJjCshrftY,2138
 hydroserverpy/etl/__init__.py,sha256=qK2m4LZl8czR3VE8SxrlipSy5tLGLNB60lxD7dD0GjU,659
 hydroserverpy/etl/hydroserver_etl.py,sha256=FSdvM3T7QHEWWulWRT8t-FMHSxAGB4GvleUXtSk5IWc,1507
+hydroserverpy/etl/timestamp_parser.py,sha256=WYI8ARCSqE-TmYsOWk-iV-O0OBiIaA04v0DttCnskc4,2767
 hydroserverpy/etl/types.py,sha256=4PY3CM-uoXIsf2lhcqtLC6HaRGXe7HKGDU22R8-H35c,135
 hydroserverpy/etl/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hydroserverpy/etl/extractors/base.py,sha256=
+hydroserverpy/etl/extractors/base.py,sha256=Y53NhCktI00z7XIU9ZAyyCmNt1WKHdQhag5sPN6-XV4,2102
 hydroserverpy/etl/extractors/ftp_extractor.py,sha256=5LwvHuvLk6LwRSVyE9EkV3DPgVlAvRrOBpl1a8B7dLg,1387
-hydroserverpy/etl/extractors/http_extractor.py,sha256=
-hydroserverpy/etl/extractors/local_file_extractor.py,sha256=
+hydroserverpy/etl/extractors/http_extractor.py,sha256=WxWyg-GLyr6Rb-2uCFniWe6Nmk71x-frmxgEYTr9juU,814
+hydroserverpy/etl/extractors/local_file_extractor.py,sha256=WZ4xIg5FiJ5GbVuR71Uj9tw_vVyzGYeweWctKscUSW0,563
 hydroserverpy/etl/loaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hydroserverpy/etl/loaders/base.py,sha256=
-hydroserverpy/etl/loaders/hydroserver_loader.py,sha256
+hydroserverpy/etl/loaders/base.py,sha256=q3pTp8NqZUYF1IxwKp7TOA5b4HuJkhz3FD9tIqpL7iM,273
+hydroserverpy/etl/loaders/hydroserver_loader.py,sha256=asqG_atwnj22xJI5CWbr1SnEHBZm0Kt0PlJ3hk8EJrM,2457
 hydroserverpy/etl/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hydroserverpy/etl/transformers/base.py,sha256=
+hydroserverpy/etl/transformers/base.py,sha256=a2_R6Bb6A9JI48B06x-0LB2YJB4qvZE6ChuAR3wjcwY,2585
 hydroserverpy/etl/transformers/csv_transformer.py,sha256=0kWfRKPwiGxCNZ87Q4SiBlfM3PuKL6upc1ljphBY89o,2891
 hydroserverpy/etl/transformers/json_transformer.py,sha256=R7tSyDB4Wn1snP75ctbEDMaMCdjyhPnMzN_W2VV3Mv4,1506
 hydroserverpy/etl_csv/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -59,9 +60,9 @@ hydroserverpy/etl_csv/exceptions.py,sha256=0UY8YUlNepG0y6FfH36hJyR1bOhwYHSZIdUSS
 hydroserverpy/etl_csv/hydroserver_etl_csv.py,sha256=0ueBphEaAAlsb0cn71Ihgd5zOD8Zdu4Ts_yGwvXW53M,14544
 hydroserverpy/quality/__init__.py,sha256=GGBMkFSXciJLYrbV-NraFrj_mXWCy_GTcy9KKrKXU4c,84
 hydroserverpy/quality/service.py,sha256=U02UfLKVmFvr5ySiH0n0JYzUIabq5uprrHIiwcqBlqY,13879
-hydroserverpy-1.
-hydroserverpy-1.
-hydroserverpy-1.
-hydroserverpy-1.
-hydroserverpy-1.
-hydroserverpy-1.
+hydroserverpy-1.2.0.dist-info/licenses/LICENSE,sha256=xVqFxDw3QOEJukakL7gQCqIMTQ1dlSCTo6Oc1otNW80,1508
+hydroserverpy-1.2.0.dist-info/METADATA,sha256=vMQFDFTg04VH7-UNWxbrGPFwp-ATawsyzBEjHHc3GbY,530
+hydroserverpy-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hydroserverpy-1.2.0.dist-info/top_level.txt,sha256=Zf37hrncXLOYvXhgCrf5mZdeq81G9fShdE2LfYbtb7w,14
+hydroserverpy-1.2.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+hydroserverpy-1.2.0.dist-info/RECORD,,
Files without changes:

{hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/WHEEL
{hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/licenses/LICENSE
{hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/top_level.txt
{hydroserverpy-1.1.2.dist-info → hydroserverpy-1.2.0.dist-info}/zip-safe