hydroserverpy 1.3.1__py3-none-any.whl → 1.4.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hydroserverpy/__init__.py +0 -2
- hydroserverpy/api/models/etl/__init__.py +26 -0
- hydroserverpy/api/models/etl/data_source.py +107 -72
- hydroserverpy/api/models/etl/etl_configuration.py +224 -0
- hydroserverpy/api/models/etl/extractors/__init__.py +6 -0
- hydroserverpy/{etl → api/models/etl}/extractors/base.py +16 -19
- hydroserverpy/{etl → api/models/etl}/extractors/http_extractor.py +7 -8
- hydroserverpy/api/models/etl/extractors/local_file_extractor.py +20 -0
- hydroserverpy/api/models/etl/factories.py +23 -0
- hydroserverpy/api/models/etl/loaders/__init__.py +4 -0
- hydroserverpy/{etl → api/models/etl}/loaders/base.py +0 -2
- hydroserverpy/api/models/etl/loaders/hydroserver_loader.py +100 -0
- hydroserverpy/api/models/etl/schedule.py +16 -0
- hydroserverpy/api/models/etl/status.py +14 -0
- hydroserverpy/{etl → api/models/etl}/timestamp_parser.py +4 -1
- hydroserverpy/api/models/etl/transformers/__init__.py +5 -0
- hydroserverpy/api/models/etl/transformers/base.py +137 -0
- hydroserverpy/{etl → api/models/etl}/transformers/csv_transformer.py +24 -13
- hydroserverpy/{etl → api/models/etl}/transformers/json_transformer.py +21 -6
- hydroserverpy/api/services/etl/data_source.py +1 -4
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/METADATA +1 -1
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/RECORD +28 -29
- hydroserverpy/etl/__init__.py +0 -21
- hydroserverpy/etl/extractors/__init__.py +0 -0
- hydroserverpy/etl/extractors/local_file_extractor.py +0 -19
- hydroserverpy/etl/hydroserver_etl.py +0 -40
- hydroserverpy/etl/loaders/__init__.py +0 -0
- hydroserverpy/etl/loaders/hydroserver_loader.py +0 -71
- hydroserverpy/etl/transformers/__init__.py +0 -0
- hydroserverpy/etl/transformers/base.py +0 -64
- hydroserverpy/etl_csv/__init__.py +0 -0
- hydroserverpy/etl_csv/exceptions.py +0 -14
- hydroserverpy/etl_csv/hydroserver_etl_csv.py +0 -342
- /hydroserverpy/{etl → api/models/etl}/extractors/ftp_extractor.py +0 -0
- /hydroserverpy/{etl → api/models/etl}/types.py +0 -0
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/WHEEL +0 -0
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/licenses/LICENSE +0 -0
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/top_level.txt +0 -0
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/zip-safe +0 -0
hydroserverpy/api/models/etl/loaders/hydroserver_loader.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+from .base import Loader
+import logging
+import pandas as pd
+from ..etl_configuration import Payload, SourceTargetMapping
+
+if TYPE_CHECKING:
+    from hydroserverpy.api.client import HydroServer
+
+
+class HydroServerLoader(Loader):
+    """
+    A class that extends the HydroServer client with ETL-specific functionalities.
+    """
+
+    def __init__(self, client: HydroServer, data_source_id):
+        self.client = client
+        self._begin_cache: dict[str, pd.Timestamp] = {}
+        self.data_source_id = data_source_id
+
+    def load(self, data: pd.DataFrame, payload: Payload) -> None:
+        """
+        Load observations from a DataFrame to the HydroServer.
+        :param data: A Pandas DataFrame where each column corresponds to a datastream.
+        """
+        begin_date = self.earliest_begin_date(payload)
+        new_data = data[data["timestamp"] > begin_date]
+        for col in new_data.columns.difference(["timestamp"]):
+            df = (
+                new_data[["timestamp", col]]
+                .rename(columns={col: "value"})
+                .dropna(subset=["value"])
+            )
+            if df.empty:
+                logging.warning(f"No new data for {col}, skipping.")
+                continue
+            logging.info(f"loading dataframe {df}")
+            logging.info(f"dtypes: {df.dtypes}")
+
+            df = df.rename(columns={"timestamp": "phenomenon_time", "value": "result"})
+
+            # Chunked upload
+            CHUNK_SIZE = 5000
+            total = len(df)
+            for start in range(0, total, CHUNK_SIZE):
+                end = min(start + CHUNK_SIZE, total)
+                chunk = df.iloc[start:end]
+                logging.info(
+                    "Uploading %s rows (%s-%s) to datastream %s",
+                    len(chunk),
+                    start,
+                    end - 1,
+                    col,
+                )
+                try:
+                    self.client.datastreams.load_observations(
+                        uid=str(col), observations=chunk
+                    )
+                except Exception as e:
+                    status = getattr(e, "status_code", None) or getattr(
+                        getattr(e, "response", None), "status_code", None
+                    )
+                    if status == 409 or "409" in str(e) or "Conflict" in str(e):
+                        logging.info(
+                            "409 Conflict for datastream %s on rows %s-%s; skipping remainder for this stream.",
+                            col,
+                            start,
+                            end - 1,
+                        )
+                        break
+                    raise
+
+    def _fetch_earliest_begin(
+        self, mappings: list[SourceTargetMapping]
+    ) -> pd.Timestamp:
+        logging.info("Querying HydroServer for earliest begin date for payload...")
+        timestamps = []
+        datastreams = self.client.datastreams.list(
+            data_source=self.data_source_id
+        ).items
+        ds_by_uid = {str(ds.uid): ds for ds in datastreams}
+        for m in mappings:
+            for p in m.paths:
+                datastream = ds_by_uid[str(p.target_identifier)]
+                raw = datastream.phenomenon_end_time or "1970-01-01"
+                ts = pd.to_datetime(raw, utc=True)
+                timestamps.append(ts)
+        logging.info(f"Found earliest begin date: {min(timestamps)}")
+        return min(timestamps)
+
+    def earliest_begin_date(self, payload: Payload) -> pd.Timestamp:
+        """
+        Return earliest begin date for a payload, or compute+cache it on first call.
+        """
+        key = payload.name
+        if key not in self._begin_cache:
+            self._begin_cache[key] = self._fetch_earliest_begin(payload.mappings)
+        return self._begin_cache[key]
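The chunked-upload loop above is simple to exercise without a live server. A minimal sketch, assuming nothing beyond pandas: StubDatastreams is a hypothetical stand-in for client.datastreams, and the chunk size is shrunk from 5000 to 5 so several chunks are produced:

import pandas as pd

class StubDatastreams:
    # Hypothetical stand-in for client.datastreams; records what would be uploaded.
    def __init__(self):
        self.loaded = []

    def load_observations(self, uid, observations):
        self.loaded.append((uid, len(observations)))

df = pd.DataFrame({
    "phenomenon_time": pd.date_range("2024-01-01", periods=12, freq="h", tz="UTC"),
    "result": range(12),
})

CHUNK_SIZE = 5  # the loader uses 5000; smaller here to show multiple chunks
stub = StubDatastreams()
for start in range(0, len(df), CHUNK_SIZE):
    chunk = df.iloc[start:start + CHUNK_SIZE]
    stub.load_observations(uid="demo-datastream", observations=chunk)

print(stub.loaded)  # [('demo-datastream', 5), ('demo-datastream', 5), ('demo-datastream', 2)]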
hydroserverpy/api/models/etl/schedule.py
@@ -0,0 +1,16 @@
+from datetime import datetime
+from typing import Literal, Optional
+from pydantic import BaseModel, Field
+
+
+class Schedule(BaseModel):
+    interval: int = Field(..., gt=0)
+    interval_units: Optional[Literal["minutes", "hours", "days"]] = Field(
+        None, alias="intervalUnits"
+    )
+    crontab: Optional[str]
+    start_time: Optional[datetime] = Field(None, alias="startTime")
+    end_time: Optional[datetime] = Field(None, alias="endTime")
+
+    class Config:
+        allow_population_by_field_name = True
hydroserverpy/api/models/etl/status.py
@@ -0,0 +1,14 @@
+from datetime import datetime
+from typing import Optional
+from pydantic import BaseModel, Field
+
+
+class Status(BaseModel):
+    paused: bool = Field(False)
+    last_run_successful: Optional[bool] = Field(None, alias="lastRunSuccessful")
+    last_run_message: Optional[str] = Field(None, alias="lastRunMessage")
+    last_run: Optional[datetime] = Field(None, alias="lastRun")
+    next_run: Optional[datetime] = Field(None, alias="nextRun")
+
+    class Config:
+        allow_population_by_field_name = True
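Both new models use pydantic v1-style Config.allow_population_by_field_name, so they accept either the camelCase wire aliases or the snake_case field names. A quick round-trip sketch (assumes pydantic v1, matching the Config syntax above, and the module paths listed in RECORD):

from hydroserverpy.api.models.etl.schedule import Schedule
from hydroserverpy.api.models.etl.status import Status

# Wire payloads arrive with camelCase aliases...
sched = Schedule.parse_obj({"interval": 15, "intervalUnits": "minutes"})
print(sched.interval_units)  # 'minutes'

# ...while allow_population_by_field_name permits snake_case construction too.
stat = Status(paused=False, last_run_successful=True)
print(stat.dict(by_alias=True))  # {'paused': False, 'lastRunSuccessful': True, ...}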
hydroserverpy/{etl → api/models/etl}/timestamp_parser.py
@@ -73,7 +73,10 @@ class TimestampParser:
         return localized.dt.tz_convert(timezone.utc)
 
     def parse_series(self, raw_series: pd.Series) -> pd.Series:
-
+        if pd.api.types.is_datetime64_any_dtype(raw_series):
+            s = raw_series  # already datetimes
+        else:
+            s = raw_series.astype("string", copy=False).str.strip()
         parsed = self._convert_series_to_UTC(s)
 
         if parsed.isna().any():
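The new guard means parse_series no longer assumes string input: a series an upstream reader already parsed to datetimes passes through untouched, while strings are normalized first. The check is plain pandas:

import pandas as pd

already_parsed = pd.Series(pd.to_datetime(["2024-01-01 00:00", "2024-01-01 01:00"]))
raw_strings = pd.Series(["  2024-01-01 00:00", "2024-01-01 01:00  "])

print(pd.api.types.is_datetime64_any_dtype(already_parsed))  # True -> used as-is
print(pd.api.types.is_datetime64_any_dtype(raw_strings))     # False -> normalize first

# The else-branch from the diff: cast to pandas string dtype and strip whitespace.
print(raw_strings.astype("string", copy=False).str.strip().tolist())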
hydroserverpy/api/models/etl/transformers/base.py
@@ -0,0 +1,137 @@
+from abc import ABC, abstractmethod
+import ast
+from functools import lru_cache
+import logging
+import re
+from typing import List, Union
+import pandas as pd
+
+from ..timestamp_parser import TimestampParser
+from ..etl_configuration import MappingPath, TransformerConfig, SourceTargetMapping
+
+ALLOWED_AST = (
+    ast.Expression,
+    ast.BinOp,
+    ast.UnaryOp,
+    ast.Add,
+    ast.Sub,
+    ast.Mult,
+    ast.Div,
+    ast.UAdd,
+    ast.USub,
+    ast.Name,
+    ast.Load,
+    ast.Constant,
+)
+
+
+def _canonicalize_expr(expr: str) -> str:
+    # normalize whitespace for cache hits; parentheses remain intact
+    return re.sub(r"\s+", "", expr)
+
+
+@lru_cache(maxsize=256)
+def _compile_arithmetic_expr_canon(expr_no_ws: str):
+    tree = ast.parse(expr_no_ws, mode="eval")
+    for node in ast.walk(tree):
+        if not isinstance(node, ALLOWED_AST):
+            raise ValueError(
+                "Only +, -, *, / with 'x' and numeric literals are allowed."
+            )
+        if isinstance(node, ast.Name) and node.id != "x":
+            raise ValueError("Only the variable 'x' is allowed.")
+        if isinstance(node, ast.Constant):
+            val = node.value
+            if isinstance(val, bool) or not isinstance(val, (int, float)):
+                raise ValueError("Only numeric literals are allowed.")
+    return compile(tree, "<expr>", "eval")
+
+
+def _compile_arithmetic_expr(expr: str):
+    return _compile_arithmetic_expr_canon(_canonicalize_expr(expr))
+
+
+class Transformer(ABC):
+    def __init__(self, transformer_config: TransformerConfig):
+        self.cfg = transformer_config
+        self.timestamp = transformer_config.timestamp
+        self.timestamp_parser = TimestampParser(self.timestamp)
+
+    @abstractmethod
+    def transform(self, *args, **kwargs) -> None:
+        pass
+
+    @property
+    def needs_datastreams(self) -> bool:
+        return False
+
+    def standardize_dataframe(
+        self, df: pd.DataFrame, mappings: List[SourceTargetMapping]
+    ):
+        logging.info(f"Successfully read payload into dataframe:\n {df}")
+
+        # 1) Normalize timestamp column
+        df.rename(columns={self.timestamp.key: "timestamp"}, inplace=True)
+        if "timestamp" not in df.columns:
+            msg = f"Timestamp column '{self.timestamp.key}' not found in data."
+            logging.error(msg)
+            raise ValueError(msg)
+        logging.info(f"Renamed timestamp column to 'timestamp'")
+
+        df["timestamp"] = self.timestamp_parser.parse_series(df["timestamp"])
+        logging.info(f"Normalized timestamp column \n {df}")
+
+        df = df.drop_duplicates(subset=["timestamp"], keep="last")
+        logging.info(f"Removed duplicates\n")
+
+        def _resolve_source_col(s_id: Union[str, int]) -> str:
+            if isinstance(s_id, int) and s_id not in df.columns:
+                try:
+                    return df.columns[s_id]
+                except IndexError:
+                    raise ValueError(
+                        f"Source index {s_id} is out of range for extracted data."
+                    )
+            if s_id not in df.columns:
+                raise ValueError(f"Source column '{s_id}' not found in extracted data.")
+            return s_id
+
+        def _apply_transformations(series: pd.Series, path: MappingPath) -> pd.Series:
+            out = series  # accumulator for sequential transforms
+            if out.dtype == "object":
+                out = pd.to_numeric(out, errors="coerce")
+
+            for transformation in path.data_transformations:
+                if transformation.type == "expression":
+                    code = _compile_arithmetic_expr(transformation.expression)
+                    try:
+                        out = eval(code, {"__builtins__": {}}, {"x": out})
+                    except Exception as ee:
+                        logging.exception(
+                            "Data transformation failed for expression=%r",
+                            transformation.expression,
+                        )
+                        raise
+                else:
+                    msg = f"Unsupported transformation type: {transformation.type}"
+                    logging.error(msg)
+                    raise ValueError(msg)
+            return out
+
+        # source target mappings may be one to many. Therefore, create a new column for each target and apply transformations
+        transformed_df = pd.DataFrame(index=df.index)
+        for m in mappings:
+            src_col = _resolve_source_col(m.source_identifier)
+            base = df[src_col]
+            for path in m.paths:
+                target_col = str(path.target_identifier)
+                transformed_df[target_col] = _apply_transformations(base, path)
+
+        logging.info(f"Mapped payload sources to targets")
+
+        # 6) Keep only timestamp + target columns
+        df = pd.concat([df[["timestamp"]], pd.DataFrame(transformed_df)], axis=1)
+
+        logging.info(f"standardized dataframe created: {df.shape}")
+
+        return df
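The module-level helpers above confine "expression" transformations to basic arithmetic over a single variable x: the parse tree is walked and anything outside the whitelist is rejected before compile. A self-contained replica of that pattern shows both the vectorized evaluation over a pandas Series and the rejection cases:

import ast
import pandas as pd

ALLOWED = (ast.Expression, ast.BinOp, ast.UnaryOp, ast.Add, ast.Sub,
           ast.Mult, ast.Div, ast.UAdd, ast.USub, ast.Name, ast.Load, ast.Constant)

def compile_expr(expr: str):
    tree = ast.parse(expr, mode="eval")
    for node in ast.walk(tree):
        if not isinstance(node, ALLOWED):
            raise ValueError("Only +, -, *, / with 'x' and numeric literals are allowed.")
        if isinstance(node, ast.Name) and node.id != "x":
            raise ValueError("Only the variable 'x' is allowed.")
        if isinstance(node, ast.Constant) and (
            isinstance(node.value, bool) or not isinstance(node.value, (int, float))
        ):
            raise ValueError("Only numeric literals are allowed.")
    return compile(tree, "<expr>", "eval")

celsius = pd.Series([0.0, 21.5, 100.0])
code = compile_expr("x * 9 / 5 + 32")
print(eval(code, {"__builtins__": {}}, {"x": celsius}).tolist())  # [32.0, 70.7, 212.0]

for bad in ("x ** 2", "__import__('os')", "y + 1"):
    try:
        compile_expr(bad)
    except ValueError as exc:
        print(f"rejected {bad!r}: {exc}")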
hydroserverpy/{etl → api/models/etl}/transformers/csv_transformer.py
@@ -1,25 +1,28 @@
 from io import StringIO
 import logging
 import pandas as pd
-from typing import Iterable, Union
+from typing import Iterable, List, Union
 from .base import Transformer
+from ..etl_configuration import TransformerConfig, SourceTargetMapping
 
 
 class CSVTransformer(Transformer):
-    def __init__(self,
-        super().__init__(
+    def __init__(self, transformer_config: TransformerConfig):
+        super().__init__(transformer_config)
 
         # Pandas is zero-based while CSV is one-based so convert
         self.header_row = (
-            None if
+            None if self.cfg.header_row is None else self.cfg.header_row - 1
         )
         self.data_start_row = (
-
+            self.cfg.data_start_row - 1 if self.cfg.data_start_row else 0
         )
-        self.delimiter =
-        self.identifier_type =
+        self.delimiter = self.cfg.delimiter or ","
+        self.identifier_type = self.cfg.identifier_type or "name"
 
-    def transform(
+    def transform(
+        self, data_file, mappings: List[SourceTargetMapping]
+    ) -> Union[pd.DataFrame, None]:
         """
         Transforms a CSV file-like object into a Pandas DataFrame where the column
         names are replaced with their target datastream ids.
@@ -31,7 +34,14 @@ class CSVTransformer(Transformer):
         """
 
         clean_file = self._strip_comments(data_file)
-
+        use_index = self.identifier_type == "index"
+
+        if use_index:
+            # Users will always interact in 1-based, so if the key is a column index, convert to 0-based to work with Pandas
+            timestamp_pos = int(self.timestamp.key) - 1
+            usecols = [timestamp_pos] + [int(m.source_identifier) - 1 for m in mappings]
+        else:
+            usecols = [self.timestamp.key] + [m.source_identifier for m in mappings]
 
         try:
             # Pandas’ heuristics strip offsets and silently coerce failures to strings.
@@ -42,16 +52,17 @@ class CSVTransformer(Transformer):
                 sep=self.delimiter,
                 header=self.header_row,
                 skiprows=self._build_skiprows(),
-                usecols=
-                dtype={self.
+                usecols=usecols,
+                dtype={self.timestamp.key: "string"},
             )
             logging.info(f"CSV file read into dataframe: {df.shape}")
         except Exception as e:
             logging.error(f"Error reading CSV data: {e}")
             return None
 
-
-
+        # In index mode, relabel columns back to original 1-based indices so base transformer can use integer labels directly
+        if use_index:
+            df.columns = [(c + 1) if isinstance(c, int) else c for c in usecols]
 
         return self.standardize_dataframe(df, mappings)
 
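The 1-based/0-based handshake is easiest to see end to end. A minimal sketch with hypothetical stand-ins (SimpleNamespace replaces TransformerConfig/SourceTargetMapping; column 1 holds the timestamp, column 3 the mapped source):

from io import StringIO
from types import SimpleNamespace
import pandas as pd

csv_text = "time,site,temp\n2024-01-01T00:00Z,A,1.5\n2024-01-01T01:00Z,A,1.7\n"

timestamp = SimpleNamespace(key="1")                 # 1-based, as a user would enter it
mappings = [SimpleNamespace(source_identifier="3")]  # temp column, 1-based

# identifier_type == "index": shift to 0-based for pandas
timestamp_pos = int(timestamp.key) - 1
usecols = [timestamp_pos] + [int(m.source_identifier) - 1 for m in mappings]

df = pd.read_csv(StringIO(csv_text), usecols=usecols)
# Relabel back to 1-based so downstream mapping code keeps using the user's indices.
# (usecols is already in file order here, so positional assignment lines up.)
df.columns = [c + 1 for c in usecols]
print(df)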
hydroserverpy/{etl → api/models/etl}/transformers/json_transformer.py
@@ -1,17 +1,18 @@
 import logging
 import pandas as pd
-from typing import
+from typing import Optional, Any, List
 from .base import Transformer
 import json
 import jmespath
+from ..etl_configuration import TransformerConfig, SourceTargetMapping
 
 
 class JSONTransformer(Transformer):
-    def __init__(self,
-        super().__init__(
-        self.
+    def __init__(self, transformer_config: TransformerConfig):
+        super().__init__(transformer_config)
+        self.jmespath = transformer_config.jmespath
 
-    def transform(self, data_file, mappings):
+    def transform(self, data_file, mappings: List[SourceTargetMapping]):
         """
         Transforms a JSON file-like object into the standard Pandas dataframe format.
         Since JMESPath can natively rename column names, the assumption is the timestamp column
@@ -23,7 +24,21 @@ class JSONTransformer(Transformer):
         Returns:
             pd.DataFrame: pandas DataFrames in the format pd.Timestamp, datastream_id_1, datastream_id_2, ...
         """
+        if data_file is None:
+            raise TypeError(
+                "JSONTransformer received None; expected file-like, bytes, or str"
+            )
+
         json_data = json.load(data_file)
+        logging.info(f"Read in json data: \n{data_file}")
+        logging.info(
+            "JSONTransformer cfg:\n jmespath=%r\n ts.key=%r\n ts.format=%r\n ts.custom=%r",
+            self.cfg.jmespath,
+            self.timestamp.key,
+            self.timestamp.format,
+            self.timestamp.custom_format,
+        )
+
         data_points = self.extract_data_points(json_data)
         if not data_points:
             logging.warning("No data points found in the JSON data.")
@@ -35,7 +50,7 @@ class JSONTransformer(Transformer):
 
     def extract_data_points(self, json_data: Any) -> Optional[List[dict]]:
         """Extracts data points from the JSON data using the data_path."""
-        data_points = jmespath.search(self.
+        data_points = jmespath.search(self.jmespath, json_data)
 
         if isinstance(data_points, dict):
             data_points = [data_points]
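Because the JMESPath query both selects and renames fields, one expression can emit rows already shaped like the standard frame. A small sketch against a hypothetical payload (uses the jmespath package the module imports):

import jmespath
import pandas as pd

payload = {
    "results": [
        {"time": "2024-01-01T00:00:00Z", "values": {"stage": 1.2}},
        {"time": "2024-01-01T01:00:00Z", "values": {"stage": 1.3}},
    ]
}

# Select and rename in one pass, mirroring extract_data_points above.
data_points = jmespath.search("results[*].{timestamp: time, stage: values.stage}", payload)
if isinstance(data_points, dict):  # a single object gets wrapped, as the transformer does
    data_points = [data_points]

print(pd.DataFrame(data_points))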
hydroserverpy/api/services/etl/data_source.py
@@ -77,10 +77,7 @@ class DataSourceService(HydroServerBaseService):
                 "paused": paused,
             },
             "datastreamIds": (
-                [
-                    normalize_uuid(datastream)
-                    for datastream in datastreams
-                ]
+                [normalize_uuid(datastream) for datastream in datastreams]
                 if datastreams
                 else []
             ),
{hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/RECORD
@@ -1,14 +1,32 @@
-hydroserverpy/__init__.py,sha256=
+hydroserverpy/__init__.py,sha256=xnuWIehUYshy05GptyIvHiD52FOjoWOloXAfT1LgP3U,150
 hydroserverpy/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hydroserverpy/api/client.py,sha256=jduKZV2cOkPVRjIjAiVYnTncMfEtW6IaCb895Y_PfiI,5697
 hydroserverpy/api/utils.py,sha256=1RUglpvegBZOcu9BEExxsAzaGOyu4tdUk2JyiBEbzxI,496
 hydroserverpy/api/models/__init__.py,sha256=NLq95t1oC2co5aqVYSw9Pq0RAsLHnLjNq1tsgbMepTg,773
 hydroserverpy/api/models/base.py,sha256=mQZbanDg9t2GN9mOR_XOOtAfYF7AkY0fBZ6fHat6NRs,6944
-hydroserverpy/api/models/etl/__init__.py,sha256=
+hydroserverpy/api/models/etl/__init__.py,sha256=_D8_Nbs06-47wwsHOAF9tOohQYQ52gOhz_NUyfloMUw,699
 hydroserverpy/api/models/etl/data_archive.py,sha256=rnmD_FQ1yjJ0KPBigylAQ3uQ6QBppJtBopJK4oCPLSo,2613
-hydroserverpy/api/models/etl/data_source.py,sha256=
+hydroserverpy/api/models/etl/data_source.py,sha256=YWSdudFONe1eniGBkruLRAP8BDyt0tGmZD8hzzByjKU,5123
+hydroserverpy/api/models/etl/etl_configuration.py,sha256=anD_0zlldJKogie15j9SIabJvNqnY5fz5xjuFsFUFwU,6034
+hydroserverpy/api/models/etl/factories.py,sha256=-inTw_C694YieDU4vbrm1qyeZMEYZqVhHSyEQJLMueo,802
 hydroserverpy/api/models/etl/orchestration_configuration.py,sha256=ElSrgi7ioFZJFJg6aGogW5ZZk7fA17y4p--yWwiOhZ0,1367
 hydroserverpy/api/models/etl/orchestration_system.py,sha256=5wdGsXCMqHfE3--zG-3WAPAVPNMPIx99y-7UUhdCink,2060
+hydroserverpy/api/models/etl/schedule.py,sha256=-TxRpYSFbyYkzAPBWOh5udx1s6v1SvLl3_LE2j_b1uE,512
+hydroserverpy/api/models/etl/status.py,sha256=vYT7go7DMcOgy29w0yhHpKz6AdprLmOxWZE9G_DHVdw,503
+hydroserverpy/api/models/etl/timestamp_parser.py,sha256=lDnParK2j2M9TF7qspJDeKFGGpO4d1F2KJEKZ4xH5Yw,4374
+hydroserverpy/api/models/etl/types.py,sha256=4PY3CM-uoXIsf2lhcqtLC6HaRGXe7HKGDU22R8-H35c,135
+hydroserverpy/api/models/etl/extractors/__init__.py,sha256=Z0viw2vk96Ytpz3n7ODtkYz9Zx0I0NsZUbna2ZWvhkw,243
+hydroserverpy/api/models/etl/extractors/base.py,sha256=uLAdi1PrOVMtuCU1ZN_liBW_ElD2mklrBrQ_AZZQtNw,1949
+hydroserverpy/api/models/etl/extractors/ftp_extractor.py,sha256=5LwvHuvLk6LwRSVyE9EkV3DPgVlAvRrOBpl1a8B7dLg,1387
+hydroserverpy/api/models/etl/extractors/http_extractor.py,sha256=AgS0vDmHhN3do1FII-hNEvkK40lDjlS1iftHplWd1No,805
+hydroserverpy/api/models/etl/extractors/local_file_extractor.py,sha256=AwC0T-F8D0S7zR0MUIQXKLfv9b0uU60YoUW615lgNl0,648
+hydroserverpy/api/models/etl/loaders/__init__.py,sha256=rEqYo1Tim7Fzrp1jPhV_yn3ll90dUGMAjcieEqh_4Pk,118
+hydroserverpy/api/models/etl/loaders/base.py,sha256=J3dqm_b6BmEsF7VR3sUxBVQpLJsRx7fTIir5v5TORE0,229
+hydroserverpy/api/models/etl/loaders/hydroserver_loader.py,sha256=0qdsSu4bLD4R86eSyQY2Qwg1HmMsruVjFhBKCrErsBU,3868
+hydroserverpy/api/models/etl/transformers/__init__.py,sha256=YQhjdoRdq4xikLWETnsRIaMvCae5flRpPrfw9lj9pOA,184
+hydroserverpy/api/models/etl/transformers/base.py,sha256=Qt9U6rRFS8aq142n0Cig1wMkVC9-4IBiXIE1nAjEPDY,4971
+hydroserverpy/api/models/etl/transformers/csv_transformer.py,sha256=06AWV9S9we4LRQLpn5WMVl7sX6ylDKPb2KHNC0Jiu7o,3478
+hydroserverpy/api/models/etl/transformers/json_transformer.py,sha256=puKQI8abWJEQTcw34lEHgHjuPOuzcSBv95_txErzchk,2116
 hydroserverpy/api/models/iam/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hydroserverpy/api/models/iam/account.py,sha256=7COk_CPYFlthg1uFWTBlJESfnuqMW90TSjZoIcBb-_8,439
 hydroserverpy/api/models/iam/apikey.py,sha256=Z4iXg_K056naT3ogwc5wzyNnRpxHkOCz0lk-Gim4eL8,3146
@@ -28,7 +46,7 @@ hydroserverpy/api/services/__init__.py,sha256=Nb7rc1Zt8kpRElgFdWPdcyUDrtm7XdJDgz
 hydroserverpy/api/services/base.py,sha256=f7CoQ1m-pdgVwqJsdvE7vcannw-3i7yJgBMI4eHZxAQ,3725
 hydroserverpy/api/services/etl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hydroserverpy/api/services/etl/data_archive.py,sha256=-Pmv9EqNJncVX3gPDIeNM4TsR6fHgOIjmMGt9fGOeYg,5842
-hydroserverpy/api/services/etl/data_source.py,sha256=
+hydroserverpy/api/services/etl/data_source.py,sha256=XWWgbVyhyZxRt4s6wBc9-lnv_O86Bte1Vk3_Aza4HGY,5773
 hydroserverpy/api/services/etl/orchestration_system.py,sha256=Otj_DiFpFBQzSc4Ei7LxneBf3VPnodI0pqoQM2BldcM,1935
 hydroserverpy/api/services/iam/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hydroserverpy/api/services/iam/role.py,sha256=PV0odC_lL9kV3ggrTjAUEMTo0WNUzv4AeMHNAXlkbN8,1137
@@ -41,30 +59,11 @@ hydroserverpy/api/services/sta/result_qualifier.py,sha256=gkgofUqzGXgdkyAvK9RW_d
 hydroserverpy/api/services/sta/sensor.py,sha256=SmrIFNHD_vrlnbZvzsv0Wf0Pexk2oDWQ28LtWdj2kao,3274
 hydroserverpy/api/services/sta/thing.py,sha256=Hyo3zTghSs7IIdsOGRu35i9w-aGOYlK9bl2AnmU4bBs,6666
 hydroserverpy/api/services/sta/unit.py,sha256=NFToSAIGTwDfwYWe8Q-I_f5xsw_GYzFEkMnhSJ-ChvE,2178
-hydroserverpy/etl/__init__.py,sha256=qK2m4LZl8czR3VE8SxrlipSy5tLGLNB60lxD7dD0GjU,659
-hydroserverpy/etl/hydroserver_etl.py,sha256=FSdvM3T7QHEWWulWRT8t-FMHSxAGB4GvleUXtSk5IWc,1507
-hydroserverpy/etl/timestamp_parser.py,sha256=MA_a0qPExbIQGt-ju7w6WflVDMzigW1LKUFCJ_jhkp4,4218
-hydroserverpy/etl/types.py,sha256=4PY3CM-uoXIsf2lhcqtLC6HaRGXe7HKGDU22R8-H35c,135
-hydroserverpy/etl/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hydroserverpy/etl/extractors/base.py,sha256=mK8WotEcG-4cHIW3ExS03wxyKtXPzcDhmo8S_5CGnek,1989
-hydroserverpy/etl/extractors/ftp_extractor.py,sha256=5LwvHuvLk6LwRSVyE9EkV3DPgVlAvRrOBpl1a8B7dLg,1387
-hydroserverpy/etl/extractors/http_extractor.py,sha256=WxWyg-GLyr6Rb-2uCFniWe6Nmk71x-frmxgEYTr9juU,814
-hydroserverpy/etl/extractors/local_file_extractor.py,sha256=WZ4xIg5FiJ5GbVuR71Uj9tw_vVyzGYeweWctKscUSW0,563
-hydroserverpy/etl/loaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hydroserverpy/etl/loaders/base.py,sha256=q3pTp8NqZUYF1IxwKp7TOA5b4HuJkhz3FD9tIqpL7iM,273
-hydroserverpy/etl/loaders/hydroserver_loader.py,sha256=N4zu_PefOwMr-NoFvq0g57VumYpNtD6o76oqhmF35ts,2545
-hydroserverpy/etl/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hydroserverpy/etl/transformers/base.py,sha256=BtRNQItt6VY9r1TBMHByOTzOB1rY1QdY8ijqCgl0riI,2259
-hydroserverpy/etl/transformers/csv_transformer.py,sha256=0kWfRKPwiGxCNZ87Q4SiBlfM3PuKL6upc1ljphBY89o,2891
-hydroserverpy/etl/transformers/json_transformer.py,sha256=R7tSyDB4Wn1snP75ctbEDMaMCdjyhPnMzN_W2VV3Mv4,1506
-hydroserverpy/etl_csv/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hydroserverpy/etl_csv/exceptions.py,sha256=0UY8YUlNepG0y6FfH36hJyR1bOhwYHSZIdUSSMTg7GA,314
-hydroserverpy/etl_csv/hydroserver_etl_csv.py,sha256=zZDIpbBTUdm4-9G3gJ8F_IqsLvP5wtGvr4Xy6_5K3tQ,14181
 hydroserverpy/quality/__init__.py,sha256=GGBMkFSXciJLYrbV-NraFrj_mXWCy_GTcy9KKrKXU4c,84
 hydroserverpy/quality/service.py,sha256=U02UfLKVmFvr5ySiH0n0JYzUIabq5uprrHIiwcqBlqY,13879
-hydroserverpy-1.
-hydroserverpy-1.
-hydroserverpy-1.
-hydroserverpy-1.
-hydroserverpy-1.
-hydroserverpy-1.
+hydroserverpy-1.4.0b4.dist-info/licenses/LICENSE,sha256=xVqFxDw3QOEJukakL7gQCqIMTQ1dlSCTo6Oc1otNW80,1508
+hydroserverpy-1.4.0b4.dist-info/METADATA,sha256=GCgulq3Im1uhFlRJtRg54dwOIqHK3wFfxAXpT6hlDlA,532
+hydroserverpy-1.4.0b4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hydroserverpy-1.4.0b4.dist-info/top_level.txt,sha256=Zf37hrncXLOYvXhgCrf5mZdeq81G9fShdE2LfYbtb7w,14
+hydroserverpy-1.4.0b4.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+hydroserverpy-1.4.0b4.dist-info/RECORD,,
hydroserverpy/etl/__init__.py DELETED
@@ -1,21 +0,0 @@
-from .extractors.local_file_extractor import LocalFileExtractor
-from .extractors.ftp_extractor import FTPExtractor
-from .extractors.http_extractor import HTTPExtractor
-from .transformers.csv_transformer import CSVTransformer
-from .transformers.json_transformer import JSONTransformer
-from .transformers.base import Transformer
-from .extractors.base import Extractor
-from .loaders.base import Loader
-from .loaders.hydroserver_loader import HydroServerLoader
-
-__all__ = [
-    "CSVTransformer",
-    "JSONTransformer",
-    "LocalFileExtractor",
-    "FTPExtractor",
-    "HTTPExtractor",
-    "Extractor",
-    "Transformer",
-    "Loader",
-    "HydroServerLoader",
-]

hydroserverpy/etl/extractors/__init__.py DELETED (file without changes)
hydroserverpy/etl/extractors/local_file_extractor.py DELETED
@@ -1,19 +0,0 @@
-import logging
-from .base import Extractor
-
-
-class LocalFileExtractor(Extractor):
-    def __init__(self, settings: object):
-        super().__init__(settings)
-
-    def extract(self):
-        """
-        Opens the file and returns a file-like object.
-        """
-        try:
-            file_handle = open(self.source_uri, "r")
-            logging.info(f"Successfully opened file '{self.source_uri}'.")
-            return file_handle
-        except Exception as e:
-            logging.error(f"Error opening file '{self.source_uri}': {e}")
-            return None
hydroserverpy/etl/hydroserver_etl.py DELETED
@@ -1,40 +0,0 @@
-import logging
-import pandas as pd
-
-
-class HydroServerETL:
-    def __init__(self, extractor, transformer, loader, source_target_map):
-        self.extractor = extractor
-        self.transformer = transformer
-        self.loader = loader
-        self.source_target_map = source_target_map
-
-    def run(self):
-        """
-        Extracts, transforms, and loads data as defined by the class parameters.
-        """
-
-        # Step 1: Get Target System data requirements from the Loader & prepare parameters for the Extractor
-        data_requirements = self.loader.get_data_requirements(self.source_target_map)
-        self.extractor.prepare_params(data_requirements)
-
-        # Step 2: Extract
-        data = self.extractor.extract()
-        if data is None or (isinstance(data, pd.DataFrame) and data.empty):
-            logging.warning(f"No data was returned from the extractor. Ending ETL run.")
-            return
-        else:
-            logging.info(f"Successfully extracted data.")
-
-        # Step 3: Transform
-        if self.transformer:
-            data = self.transformer.transform(data)
-            if data is None or (isinstance(data, pd.DataFrame) and data.empty):
-                logging.warning(f"No data returned from the transformer. Ending run.")
-                return
-            else:
-                logging.info(f"Successfully transformed data. {data}")
-
-        # Step 4: Load
-        self.loader.load(data, self.source_target_map)
-        logging.info("Successfully loaded data.")

hydroserverpy/etl/loaders/__init__.py DELETED (file without changes)