hydroserverpy-1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hydroserverpy/__init__.py +7 -0
- hydroserverpy/api/__init__.py +0 -0
- hydroserverpy/api/client.py +203 -0
- hydroserverpy/api/models/__init__.py +22 -0
- hydroserverpy/api/models/base.py +207 -0
- hydroserverpy/api/models/etl/__init__.py +26 -0
- hydroserverpy/api/models/etl/data_archive.py +77 -0
- hydroserverpy/api/models/etl/data_source.py +146 -0
- hydroserverpy/api/models/etl/etl_configuration.py +224 -0
- hydroserverpy/api/models/etl/extractors/__init__.py +6 -0
- hydroserverpy/api/models/etl/extractors/base.py +52 -0
- hydroserverpy/api/models/etl/extractors/ftp_extractor.py +50 -0
- hydroserverpy/api/models/etl/extractors/http_extractor.py +28 -0
- hydroserverpy/api/models/etl/extractors/local_file_extractor.py +20 -0
- hydroserverpy/api/models/etl/factories.py +23 -0
- hydroserverpy/api/models/etl/loaders/__init__.py +4 -0
- hydroserverpy/api/models/etl/loaders/base.py +11 -0
- hydroserverpy/api/models/etl/loaders/hydroserver_loader.py +98 -0
- hydroserverpy/api/models/etl/orchestration_configuration.py +35 -0
- hydroserverpy/api/models/etl/orchestration_system.py +63 -0
- hydroserverpy/api/models/etl/schedule.py +16 -0
- hydroserverpy/api/models/etl/status.py +14 -0
- hydroserverpy/api/models/etl/timestamp_parser.py +112 -0
- hydroserverpy/api/models/etl/transformers/__init__.py +5 -0
- hydroserverpy/api/models/etl/transformers/base.py +135 -0
- hydroserverpy/api/models/etl/transformers/csv_transformer.py +88 -0
- hydroserverpy/api/models/etl/transformers/json_transformer.py +48 -0
- hydroserverpy/api/models/etl/types.py +7 -0
- hydroserverpy/api/models/iam/__init__.py +0 -0
- hydroserverpy/api/models/iam/account.py +12 -0
- hydroserverpy/api/models/iam/apikey.py +96 -0
- hydroserverpy/api/models/iam/collaborator.py +70 -0
- hydroserverpy/api/models/iam/role.py +38 -0
- hydroserverpy/api/models/iam/workspace.py +297 -0
- hydroserverpy/api/models/sta/__init__.py +0 -0
- hydroserverpy/api/models/sta/datastream.py +254 -0
- hydroserverpy/api/models/sta/observation.py +103 -0
- hydroserverpy/api/models/sta/observed_property.py +37 -0
- hydroserverpy/api/models/sta/processing_level.py +35 -0
- hydroserverpy/api/models/sta/result_qualifier.py +34 -0
- hydroserverpy/api/models/sta/sensor.py +44 -0
- hydroserverpy/api/models/sta/thing.py +113 -0
- hydroserverpy/api/models/sta/unit.py +36 -0
- hydroserverpy/api/services/__init__.py +12 -0
- hydroserverpy/api/services/base.py +118 -0
- hydroserverpy/api/services/etl/__init__.py +0 -0
- hydroserverpy/api/services/etl/data_archive.py +166 -0
- hydroserverpy/api/services/etl/data_source.py +163 -0
- hydroserverpy/api/services/etl/orchestration_system.py +66 -0
- hydroserverpy/api/services/iam/__init__.py +0 -0
- hydroserverpy/api/services/iam/role.py +38 -0
- hydroserverpy/api/services/iam/workspace.py +232 -0
- hydroserverpy/api/services/sta/__init__.py +0 -0
- hydroserverpy/api/services/sta/datastream.py +296 -0
- hydroserverpy/api/services/sta/observed_property.py +82 -0
- hydroserverpy/api/services/sta/processing_level.py +72 -0
- hydroserverpy/api/services/sta/result_qualifier.py +64 -0
- hydroserverpy/api/services/sta/sensor.py +102 -0
- hydroserverpy/api/services/sta/thing.py +195 -0
- hydroserverpy/api/services/sta/unit.py +78 -0
- hydroserverpy/api/utils.py +22 -0
- hydroserverpy/quality/__init__.py +1 -0
- hydroserverpy/quality/service.py +405 -0
- hydroserverpy-1.5.1.dist-info/METADATA +66 -0
- hydroserverpy-1.5.1.dist-info/RECORD +69 -0
- hydroserverpy-1.5.1.dist-info/WHEEL +5 -0
- hydroserverpy-1.5.1.dist-info/licenses/LICENSE +28 -0
- hydroserverpy-1.5.1.dist-info/top_level.txt +1 -0
- hydroserverpy-1.5.1.dist-info/zip-safe +1 -0
hydroserverpy/api/models/etl/etl_configuration.py
@@ -0,0 +1,224 @@
+from typing import Annotated, Dict, List, Literal, Optional, Union
+from pydantic import BaseModel, Field, field_validator
+from enum import Enum
+
+WorkflowType = Literal["ETL", "Aggregation", "Virtual", "SDL"]
+CSVDelimiterType = Literal[",", "|", "\t", ";", " "]
+ExtractorType = Literal["HTTP", "local"]
+TransformerType = Literal["JSON", "CSV"]
+LoaderType = Literal["HydroServer"]
+IdentifierType = Literal["name", "index"]
+RunTimeValue = Literal["jobExecutionTime", "latestObservationTimestamp"]
+
+
+class FixedOffsetTimezone(str, Enum):
+    UTC_MINUS_1200 = "-1200"
+    UTC_MINUS_1100 = "-1100"
+    UTC_MINUS_1000 = "-1000"
+    UTC_MINUS_0900 = "-0900"
+    UTC_MINUS_0800 = "-0800"
+    UTC_MINUS_0700 = "-0700"
+    UTC_MINUS_0600 = "-0600"
+    UTC_MINUS_0500 = "-0500"
+    UTC_MINUS_0430 = "-0430"
+    UTC_MINUS_0400 = "-0400"
+    UTC_MINUS_0330 = "-0330"
+    UTC_MINUS_0300 = "-0300"
+    UTC_MINUS_0200 = "-0200"
+    UTC_MINUS_0100 = "-0100"
+    UTC_PLUS_0000 = "+0000"
+    UTC_PLUS_0100 = "+0100"
+    UTC_PLUS_0200 = "+0200"
+    UTC_PLUS_0300 = "+0300"
+    UTC_PLUS_0330 = "+0330"
+    UTC_PLUS_0400 = "+0400"
+    UTC_PLUS_0430 = "+0430"
+    UTC_PLUS_0500 = "+0500"
+    UTC_PLUS_0530 = "+0530"
+    UTC_PLUS_0545 = "+0545"
+    UTC_PLUS_0600 = "+0600"
+    UTC_PLUS_0630 = "+0630"
+    UTC_PLUS_0700 = "+0700"
+    UTC_PLUS_0800 = "+0800"
+    UTC_PLUS_0845 = "+0845"
+    UTC_PLUS_0900 = "+0900"
+    UTC_PLUS_0930 = "+0930"
+    UTC_PLUS_1000 = "+1000"
+    UTC_PLUS_1030 = "+1030"
+    UTC_PLUS_1100 = "+1100"
+    UTC_PLUS_1130 = "+1130"
+    UTC_PLUS_1200 = "+1200"
+    UTC_PLUS_1245 = "+1245"
+    UTC_PLUS_1300 = "+1300"
+    UTC_PLUS_1400 = "+1400"
+
+
+class TimestampFormat(str, Enum):
+    ISO8601 = "ISO8601"
+    naive = "naive"
+    custom = "custom"
+
+
+class TimezoneMode(str, Enum):
+    utc = "utc"  # always UTC
+    daylightSavings = "daylightSavings"  # IANA / DST-aware
+    fixedOffset = "fixedOffset"  # constant offset
+    embeddedOffset = "embeddedOffset"  # offset in ISO string
+
+
+class Timestamp(BaseModel):
+    key: Optional[str] = None
+    format: TimestampFormat
+    custom_format: Optional[str] = Field(None, alias="customFormat")
+    timezone_mode: TimezoneMode = Field(..., alias="timezoneMode")
+    timezone: Optional[Union[FixedOffsetTimezone, str]] = Field(None, alias="timezone")
+
+    class Config:
+        populate_by_name = True
+
+    @field_validator("timezone")
+    def check_timezone(cls, timezone_value, info):
+        mode = info.data.get("timezone_mode")
+        if mode == TimezoneMode.fixedOffset and timezone_value is None:
+            raise ValueError("`timezone` must be set when timezoneMode is fixedOffset")
+        return timezone_value
+
+
+class PerPayloadPlaceholder(BaseModel):
+    name: str
+    type: Literal["perPayload"]
+
+
+class RunTimePlaceholder(BaseModel):
+    name: str
+    type: Literal["runTime"]
+    run_time_value: RunTimeValue = Field(..., alias="runTimeValue")
+    timestamp: Timestamp
+
+    class Config:
+        populate_by_name = True
+
+
+PlaceholderVariable = Annotated[
+    Union[PerPayloadPlaceholder, RunTimePlaceholder],
+    Field(discriminator="type"),
+]
+
+
+class BaseExtractor(BaseModel):
+    type: ExtractorType
+    source_uri: str = Field(..., alias="sourceUri")
+    placeholder_variables: Optional[List[PlaceholderVariable]] = Field(
+        default_factory=list,
+        alias="placeholderVariables",
+    )
+
+    class Config:
+        populate_by_name = True
+
+
+class HTTPExtractor(BaseExtractor):
+    type: Literal["HTTP"]
+
+
+class LocalFileExtractor(BaseExtractor):
+    type: Literal["local"]
+
+
+ExtractorConfig = Annotated[
+    Union[HTTPExtractor, LocalFileExtractor], Field(discriminator="type")
+]
+
+
+class BaseTransformer(BaseModel):
+    type: TransformerType
+    timestamp: Timestamp
+
+
+class JSONTransformer(BaseTransformer):
+    type: Literal["JSON"]
+    jmespath: str = Field(..., alias="JMESPath")
+
+    class Config:
+        populate_by_name = True
+
+
+class CSVTransformer(BaseTransformer):
+    type: Literal["CSV"]
+    header_row: Optional[int] = Field(..., alias="headerRow")
+    data_start_row: int = Field(..., alias="dataStartRow")
+    delimiter: CSVDelimiterType
+    identifier_type: IdentifierType = Field(..., alias="identifierType")
+
+    class Config:
+        populate_by_name = True
+
+
+TransformerConfig = Union[JSONTransformer, CSVTransformer]
+
+
+class BaseLoaderConfig(BaseModel):
+    type: LoaderType
+
+
+class HydroServerLoaderConfig(BaseLoaderConfig):
+    type: Literal["HydroServer"]
+
+
+LoaderConfig = HydroServerLoaderConfig
+
+
+class ExpressionDataTransformation(BaseModel):
+    type: Literal["expression"]
+    expression: str
+
+    class Config:
+        populate_by_name = True
+
+
+class LookupTableDataTransformation(BaseModel):
+    type: Literal["lookup"]
+    lookup_table_id: str = Field(..., alias="lookupTableId")
+
+    class Config:
+        populate_by_name = True
+
+
+DataTransformation = Union[ExpressionDataTransformation, LookupTableDataTransformation]
+
+
+class MappingPath(BaseModel):
+    target_identifier: Union[str, int] = Field(..., alias="targetIdentifier")
+    data_transformations: List[DataTransformation] = Field(
+        default_factory=list, alias="dataTransformations"
+    )
+
+    class Config:
+        populate_by_name = True
+
+
+class SourceTargetMapping(BaseModel):
+    source_identifier: Union[str, int] = Field(..., alias="sourceIdentifier")
+    paths: List[MappingPath] = Field(default_factory=list)
+
+    class Config:
+        populate_by_name = True
+
+
+class Payload(BaseModel):
+    name: str = ""
+    mappings: List[SourceTargetMapping] = Field(default_factory=list)
+    extractor_variables: Dict[str, str] = Field(
+        default_factory=dict, alias="extractorVariables"
+    )
+
+    class Config:
+        populate_by_name = True
+
+
+class EtlConfiguration(BaseModel):
+    type: WorkflowType
+    extractor: ExtractorConfig
+    transformer: TransformerConfig
+    loader: LoaderConfig
+    payloads: List[Payload]
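
Because these are standard Pydantic v2 models, a complete workflow configuration can be validated directly from the camelCase JSON the HydroServer API exchanges. A minimal sketch (the URL, column name, and datastream UUID below are invented for illustration):

from hydroserverpy.api.models.etl.etl_configuration import EtlConfiguration

raw = {
    "type": "ETL",
    "extractor": {
        "type": "HTTP",
        "sourceUri": "https://example.com/api/data?start={startTime}",
        "placeholderVariables": [{
            "name": "startTime",
            "type": "runTime",
            "runTimeValue": "latestObservationTimestamp",
            "timestamp": {"format": "ISO8601", "timezoneMode": "utc"},
        }],
    },
    "transformer": {
        "type": "CSV",
        "timestamp": {"key": "DateTime", "format": "ISO8601", "timezoneMode": "embeddedOffset"},
        "headerRow": 1,
        "dataStartRow": 2,
        "delimiter": ",",
        "identifierType": "name",
    },
    "loader": {"type": "HydroServer"},
    "payloads": [{
        "name": "site-a",
        "mappings": [{
            "sourceIdentifier": "discharge",
            "paths": [{"targetIdentifier": "00000000-0000-0000-0000-000000000000"}],
        }],
    }],
}

config = EtlConfiguration.model_validate(raw)
# The "type" discriminators select HTTPExtractor and CSVTransformer here.
print(type(config.extractor).__name__, type(config.transformer).__name__)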
hydroserverpy/api/models/etl/extractors/base.py
@@ -0,0 +1,52 @@
+from abc import abstractmethod
+import logging
+import pandas as pd
+from datetime import datetime
+from ..etl_configuration import ExtractorConfig, Payload
+from ..timestamp_parser import TimestampParser
+
+
+class Extractor:
+    def __init__(self, extractor_config: ExtractorConfig):
+        self.cfg = extractor_config
+
+    def resolve_placeholder_variables(self, payload: Payload, loader):
+        logging.info(f"Creating runtime variables...")
+        filled = {}
+        for placeholder in self.cfg.placeholder_variables:
+            name = placeholder.name
+
+            if placeholder.type == "runTime":
+                logging.info(f"Resolving runtime var: {name}")
+                if placeholder.run_time_value == "latestObservationTimestamp":
+                    value = loader.earliest_begin_date(payload)
+                elif placeholder.run_time_value == "jobExecutionTime":
+                    value = pd.Timestamp.now(tz="UTC")
+            elif placeholder.type == "perPayload":
+                logging.info(f"Resolving payload var: {name}")
+                if name not in payload.extractor_variables:
+                    raise KeyError(f"Missing per-payload variable '{name}'")
+                value = payload.extractor_variables[name]
+            else:
+                continue
+
+            if isinstance(value, (datetime, pd.Timestamp)):
+                parser = TimestampParser(placeholder.timestamp)
+                value = parser.utc_to_string(value)
+
+            filled[name] = value
+        if not filled:
+            return self.cfg.source_uri
+        return self.format_uri(filled)
+
+    def format_uri(self, placeholder_variables):
+        try:
+            uri = self.cfg.source_uri.format(**placeholder_variables)
+        except KeyError as e:
+            missing_key = e.args[0]
+            raise KeyError(f"Missing placeholder variable: {missing_key}")
+        return uri
+
+    @abstractmethod
+    def extract(self):
+        pass
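
The `source_uri` template uses str.format-style placeholders, so resolution reduces to building a name-to-value mapping: `perPayload` values come from `Payload.extractor_variables`, while `runTime` values are computed at execution time and rendered through `TimestampParser`. A small sketch of that templating contract (all values invented):

template = "https://example.com/api/data?site={siteCode}&since={startTime}"

filled = {
    "siteCode": "ABC123",                      # perPayload: from Payload.extractor_variables
    "startTime": "2024-01-01T00:00:00+0000",   # runTime: rendered by TimestampParser
}

print(template.format(**filled))
# https://example.com/api/data?site=ABC123&since=2024-01-01T00:00:00+0000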
hydroserverpy/api/models/etl/extractors/ftp_extractor.py
@@ -0,0 +1,50 @@
+import logging
+from ftplib import FTP
+from io import BytesIO
+from typing import Dict
+
+from .base import Extractor
+from ..types import TimeRange
+
+
+class FTPExtractor(Extractor):
+    def __init__(
+        self,
+        host: str,
+        filepath: str,
+        username: str = None,
+        password: str = None,
+        port: int = 21,
+    ):
+        self.host = host
+        self.port = int(port)
+        self.username = username
+        self.password = password
+        self.filepath = filepath
+
+    def prepare_params(self, data_requirements: Dict[str, TimeRange]):
+        pass
+
+    def extract(self):
+        """
+        Downloads the file from the FTP server and returns a file-like object.
+        """
+        ftp = FTP()
+        try:
+            ftp.connect(self.host, self.port)
+            ftp.login(user=self.username, passwd=self.password)
+            logging.info(f"Connected to FTP server: {self.host}:{self.port}")
+
+            data = BytesIO()
+            ftp.retrbinary(f"RETR {self.filepath}", data.write)
+            logging.info(
+                f"Successfully downloaded file '{self.filepath}' from FTP server."
+            )
+            data.seek(0)
+            return data
+        except Exception as e:
+            logging.error(f"Error retrieving file from FTP server: {e}")
+            return None
+        finally:
+            if ftp:
+                ftp.quit()
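
Note that, unlike the HTTP and local extractors, FTPExtractor takes discrete connection arguments rather than an ExtractorConfig, and "FTP" does not appear in the ExtractorType literal above. A usage sketch (host and path are placeholders; with `username`/`password` left as None, ftplib falls back to anonymous login):

from hydroserverpy.api.models.etl.extractors.ftp_extractor import FTPExtractor

extractor = FTPExtractor(host="ftp.example.com", filepath="/pub/site-a/latest.csv")
buffer = extractor.extract()  # BytesIO on success, None on failure
if buffer is not None:
    print(buffer.readline().decode())  # first line of the downloaded file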
hydroserverpy/api/models/etl/extractors/http_extractor.py
@@ -0,0 +1,28 @@
+import logging
+import requests
+from io import BytesIO
+
+from ..etl_configuration import Payload
+from .base import Extractor, ExtractorConfig
+
+
+class HTTPExtractor(Extractor):
+    def __init__(self, settings: ExtractorConfig):
+        super().__init__(settings)
+
+    def extract(self, payload: Payload, loader=None):
+        """
+        Downloads the file from the HTTP/HTTPS server and returns a file-like object.
+        """
+        url = self.resolve_placeholder_variables(payload, loader)
+        logging.info(f"Requesting data from → {url}")
+
+        response = requests.get(url)
+        response.raise_for_status()
+
+        data = BytesIO()
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                data.write(chunk)
+        data.seek(0)
+        return data
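
End to end, the HTTP extractor is driven by a config model plus a payload: placeholders declared as `perPayload` are filled from the payload's `extractorVariables` before the request is made. A sketch against a hypothetical endpoint (note the config model and the runtime class share the name HTTPExtractor, hence the aliased import):

from hydroserverpy.api.models.etl.etl_configuration import (
    HTTPExtractor as HTTPExtractorConfig,
    Payload,
)
from hydroserverpy.api.models.etl.extractors.http_extractor import HTTPExtractor

cfg = HTTPExtractorConfig(
    type="HTTP",
    sourceUri="https://example.com/data/{siteCode}.csv",
    placeholderVariables=[{"name": "siteCode", "type": "perPayload"}],
)
payload = Payload(name="site-a", extractorVariables={"siteCode": "ABC123"})

extractor = HTTPExtractor(cfg)
data = extractor.extract(payload)  # BytesIO of https://example.com/data/ABC123.csv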
hydroserverpy/api/models/etl/extractors/local_file_extractor.py
@@ -0,0 +1,20 @@
+import logging
+from .base import Extractor
+from ..etl_configuration import ExtractorConfig
+
+
+class LocalFileExtractor(Extractor):
+    def __init__(self, extractor_config: ExtractorConfig):
+        super().__init__(extractor_config)
+
+    def extract(self, *args, **kwargs):
+        """
+        Opens the file and returns a file-like object.
+        """
+        try:
+            file_handle = open(self.cfg.source_uri, "r")
+            logging.info(f"Successfully opened file '{self.cfg.source_uri}'.")
+            return file_handle
+        except Exception as e:
+            logging.error(f"Error opening file '{self.cfg.source_uri}': {e}")
+            return None
hydroserverpy/api/models/etl/factories.py
@@ -0,0 +1,23 @@
+from .extractors import HTTPExtractor, LocalFileExtractor
+from .transformers import JSONTransformer, CSVTransformer
+from .loaders import HydroServerLoader
+from .etl_configuration import ExtractorConfig, TransformerConfig, LoaderConfig
+
+EXTRACTORS = {"HTTP": HTTPExtractor, "local": LocalFileExtractor}
+TRANSFORMERS = {"JSON": JSONTransformer, "CSV": CSVTransformer}
+LOADERS = {"HydroServer": HydroServerLoader}
+
+
+def extractor_factory(settings: ExtractorConfig):
+    cls = EXTRACTORS[settings.type]
+    return cls(settings)
+
+
+def transformer_factory(settings: TransformerConfig):
+    cls = TRANSFORMERS[settings.type]
+    return cls(settings)
+
+
+def loader_factory(settings: LoaderConfig, auth_context, data_source_id: str):
+    cls = LOADERS[settings.type]
+    return cls(auth_context, data_source_id)
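
These factories let a runner assemble the pipeline straight from a validated EtlConfiguration. A sketch, assuming `config` is such a configuration, `client` is an authenticated HydroServer client, and `data_source_id` identifies the data source being run (the transformer and loader hand-off is defined by the transformer classes, which are outside this excerpt):

from hydroserverpy.api.models.etl.factories import (
    extractor_factory,
    transformer_factory,
    loader_factory,
)

extractor = extractor_factory(config.extractor)
transformer = transformer_factory(config.transformer)
loader = loader_factory(config.loader, client, data_source_id)

for payload in config.payloads:
    raw = extractor.extract(payload, loader)  # file-like object per payload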
hydroserverpy/api/models/etl/loaders/hydroserver_loader.py
@@ -0,0 +1,98 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+from .base import Loader
+import logging
+import pandas as pd
+from ..etl_configuration import Payload, SourceTargetMapping
+
+if TYPE_CHECKING:
+    from hydroserverpy.api.client import HydroServer
+
+
+class HydroServerLoader(Loader):
+    """
+    A class that extends the HydroServer client with ETL-specific functionalities.
+    """
+
+    def __init__(self, client: HydroServer, data_source_id):
+        self.client = client
+        self._begin_cache: dict[str, pd.Timestamp] = {}
+        self.data_source_id = data_source_id
+
+    def load(self, data: pd.DataFrame, payload: Payload) -> None:
+        """
+        Load observations from a DataFrame to the HydroServer.
+        :param data: A Pandas DataFrame where each column corresponds to a datastream.
+        """
+        begin_date = self.earliest_begin_date(payload)
+        new_data = data[data["timestamp"] > begin_date]
+        for col in new_data.columns.difference(["timestamp"]):
+            df = (
+                new_data[["timestamp", col]]
+                .rename(columns={col: "value"})
+                .dropna(subset=["value"])
+            )
+            if df.empty:
+                logging.warning(f"No new data for {col}, skipping.")
+                continue
+
+            df = df.rename(columns={"timestamp": "phenomenon_time", "value": "result"})
+
+            # Chunked upload
+            CHUNK_SIZE = 5000
+            total = len(df)
+            for start in range(0, total, CHUNK_SIZE):
+                end = min(start + CHUNK_SIZE, total)
+                chunk = df.iloc[start:end]
+                logging.info(
+                    "Uploading %s rows (%s-%s) to datastream %s",
+                    len(chunk),
+                    start,
+                    end - 1,
+                    col,
+                )
+                try:
+                    self.client.datastreams.load_observations(
+                        uid=str(col), observations=chunk
+                    )
+                except Exception as e:
+                    status = getattr(e, "status_code", None) or getattr(
+                        getattr(e, "response", None), "status_code", None
+                    )
+                    if status == 409 or "409" in str(e) or "Conflict" in str(e):
+                        logging.info(
+                            "409 Conflict for datastream %s on rows %s-%s; skipping remainder for this stream.",
+                            col,
+                            start,
+                            end - 1,
+                        )
+                        break
+                    raise
+
+    def _fetch_earliest_begin(
+        self, mappings: list[SourceTargetMapping]
+    ) -> pd.Timestamp:
+        logging.info("Querying HydroServer for earliest begin date for payload...")
+        timestamps = []
+        datastreams = self.client.datastreams.list(
+            data_source=self.data_source_id
+        ).items
+        ds_by_uid = {str(ds.uid): ds for ds in datastreams}
+        for m in mappings:
+            for p in m.paths:
+                datastream = ds_by_uid[str(p.target_identifier)]
+                raw = datastream.phenomenon_end_time or "1970-01-01"
+                ts = pd.to_datetime(raw, utc=True)
+                timestamps.append(ts)
+        logging.info(f"Found earliest begin date: {min(timestamps)}")
+        return min(timestamps)
+
+    def earliest_begin_date(self, payload: Payload) -> pd.Timestamp:
+        """
+        Return earliest begin date for a payload, or compute+cache it on first call.
+        """
+        key = payload.name
+        if key not in self._begin_cache:
+            self._begin_cache[key] = self._fetch_earliest_begin(payload.mappings)
+        return self._begin_cache[key]
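
`load` expects a wide DataFrame: one `timestamp` column plus one column per target datastream, with each data column named by the datastream's UID. A sketch (the UID is invented; `loader` and `payload` are as in the factory example above):

import pandas as pd

ds_uid = "00000000-0000-0000-0000-000000000000"  # hypothetical datastream UID
frame = pd.DataFrame(
    {
        "timestamp": pd.to_datetime(
            ["2024-01-01T00:00Z", "2024-01-01T00:15Z"], utc=True
        ),
        ds_uid: [1.2, 1.3],
    }
)

# Rows at or before the datastream's current phenomenon_end_time are dropped
# before upload; the rest are posted in chunks of 5000 observations.
loader.load(frame, payload)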
hydroserverpy/api/models/etl/orchestration_configuration.py
@@ -0,0 +1,35 @@
+from pydantic import AliasPath
+from typing import Optional, Literal
+from datetime import datetime
+from pydantic import BaseModel, Field
+
+
+class OrchestrationConfigurationFields(BaseModel):
+    interval: Optional[int] = Field(
+        None, gt=0, validation_alias=AliasPath("schedule", "interval")
+    )
+    interval_units: Optional[Literal["minutes", "hours", "days"]] = Field(
+        None, validation_alias=AliasPath("schedule", "intervalUnits")
+    )
+    crontab: Optional[str] = Field(
+        None, max_length=255, validation_alias=AliasPath("schedule", "crontab")
+    )
+    start_time: Optional[datetime] = Field(
+        None, validation_alias=AliasPath("schedule", "startTime")
+    )
+    end_time: Optional[datetime] = Field(
+        None, validation_alias=AliasPath("schedule", "endTime")
+    )
+    last_run_successful: Optional[bool] = Field(
+        None, validation_alias=AliasPath("status", "lastRunSuccessful")
+    )
+    last_run_message: Optional[str] = Field(
+        None, max_length=255, validation_alias=AliasPath("status", "lastRunMessage")
+    )
+    last_run: Optional[datetime] = Field(
+        None, validation_alias=AliasPath("status", "lastRun")
+    )
+    next_run: Optional[datetime] = Field(
+        None, validation_alias=AliasPath("status", "nextRun")
+    )
+    paused: bool = Field(False, validation_alias=AliasPath("status", "paused"))
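
`AliasPath` lets these flat fields validate directly from the nested `schedule` and `status` objects an API response would carry, without an intermediate model. For example:

from hydroserverpy.api.models.etl.orchestration_configuration import (
    OrchestrationConfigurationFields,
)

# Hypothetical response fragment with nested schedule/status objects.
fields = OrchestrationConfigurationFields.model_validate(
    {
        "schedule": {"interval": 15, "intervalUnits": "minutes"},
        "status": {"paused": False, "lastRunSuccessful": True},
    }
)
assert fields.interval == 15
assert fields.last_run_successful is True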
hydroserverpy/api/models/etl/orchestration_system.py
@@ -0,0 +1,63 @@
+import uuid
+from typing import Optional, ClassVar, List, TYPE_CHECKING
+from pydantic import BaseModel, Field
+from ..base import HydroServerBaseModel
+
+if TYPE_CHECKING:
+    from hydroserverpy import HydroServer
+    from hydroserverpy.api.models import Workspace, DataSource, DataArchive
+
+
+class OrchestrationSystemFields(BaseModel):
+    name: str = Field(..., max_length=255)
+    orchestration_system_type: str = Field(..., max_length=255, alias="type")
+
+
+class OrchestrationSystem(HydroServerBaseModel):
+    name: str = Field(..., max_length=255)
+    orchestration_system_type: str = Field(..., max_length=255, alias="type")
+    workspace_id: Optional[uuid.UUID] = None
+
+    _editable_fields: ClassVar[set[str]] = {"name", "orchestration_system_type"}
+
+    def __init__(self, client: "HydroServer", **data):
+        super().__init__(client=client, service=client.orchestrationsystems, **data)
+
+        self._workspace = None
+        self._datasources = None
+        self._dataarchives = None
+
+    @classmethod
+    def get_route(cls):
+        return "orchestration-systems"
+
+    @property
+    def workspace(self) -> "Workspace":
+        """The workspace this orchestration system belongs to."""
+
+        if self._workspace is None and self.workspace_id:
+            self._workspace = self.client.workspaces.get(uid=self.workspace_id)
+
+        return self._workspace
+
+    @property
+    def datasources(self) -> List["DataSource"]:
+        """The data sources associated with this orchestration system."""
+
+        if self._datasources is None:
+            self._datasources = self.client.datasources.list(
+                orchestration_system=self.uid, fetch_all=True
+            ).items
+
+        return self._datasources
+
+    @property
+    def dataarchives(self) -> List["DataArchive"]:
+        """The data archives associated with this orchestration system."""
+
+        if self._dataarchives is None:
+            self._dataarchives = self.client.dataarchives.list(
+                orchestration_system=self.uid, fetch_all=True
+            ).items
+
+        return self._dataarchives
hydroserverpy/api/models/etl/schedule.py
@@ -0,0 +1,16 @@
+from datetime import datetime
+from typing import Literal, Optional
+from pydantic import BaseModel, Field
+
+
+class Schedule(BaseModel):
+    interval: int = Field(..., gt=0)
+    interval_units: Optional[Literal["minutes", "hours", "days"]] = Field(
+        None, alias="intervalUnits"
+    )
+    crontab: Optional[str]
+    start_time: Optional[datetime] = Field(None, alias="startTime")
+    end_time: Optional[datetime] = Field(None, alias="endTime")
+
+    class Config:
+        populate_by_name = True
hydroserverpy/api/models/etl/status.py
@@ -0,0 +1,14 @@
+from datetime import datetime
+from typing import Optional
+from pydantic import BaseModel, Field
+
+
+class Status(BaseModel):
+    paused: bool = Field(False)
+    last_run_successful: Optional[bool] = Field(None, alias="lastRunSuccessful")
+    last_run_message: Optional[str] = Field(None, alias="lastRunMessage")
+    last_run: Optional[datetime] = Field(None, alias="lastRun")
+    next_run: Optional[datetime] = Field(None, alias="nextRun")
+
+    class Config:
+        populate_by_name = True
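
Both models accept the camelCase aliases shown above and, via `populate_by_name`, the snake_case field names as well. One caveat: `crontab` on `Schedule` is declared without a default, so under Pydantic v2 it is a required field and must be passed explicitly, even as None. A round-trip sketch:

from hydroserverpy.api.models.etl.schedule import Schedule

schedule = Schedule(interval=15, interval_units="minutes", crontab=None)
print(schedule.model_dump(by_alias=True))
# {'interval': 15, 'intervalUnits': 'minutes', 'crontab': None,
#  'startTime': None, 'endTime': None}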