hydroserverpy 1.3.1__py3-none-any.whl → 1.4.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hydroserverpy might be problematic. Click here for more details.
- hydroserverpy/__init__.py +0 -2
- hydroserverpy/api/models/etl/__init__.py +26 -0
- hydroserverpy/api/models/etl/data_source.py +107 -72
- hydroserverpy/api/models/etl/etl_configuration.py +224 -0
- hydroserverpy/api/models/etl/extractors/__init__.py +6 -0
- hydroserverpy/{etl → api/models/etl}/extractors/base.py +16 -19
- hydroserverpy/{etl → api/models/etl}/extractors/http_extractor.py +7 -8
- hydroserverpy/api/models/etl/extractors/local_file_extractor.py +20 -0
- hydroserverpy/api/models/etl/factories.py +23 -0
- hydroserverpy/api/models/etl/loaders/__init__.py +4 -0
- hydroserverpy/{etl → api/models/etl}/loaders/base.py +0 -2
- hydroserverpy/api/models/etl/loaders/hydroserver_loader.py +100 -0
- hydroserverpy/api/models/etl/schedule.py +16 -0
- hydroserverpy/api/models/etl/status.py +14 -0
- hydroserverpy/{etl → api/models/etl}/timestamp_parser.py +4 -1
- hydroserverpy/api/models/etl/transformers/__init__.py +5 -0
- hydroserverpy/api/models/etl/transformers/base.py +137 -0
- hydroserverpy/{etl → api/models/etl}/transformers/csv_transformer.py +24 -13
- hydroserverpy/{etl → api/models/etl}/transformers/json_transformer.py +21 -6
- hydroserverpy/api/services/etl/data_source.py +1 -4
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/METADATA +1 -1
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/RECORD +28 -29
- hydroserverpy/etl/__init__.py +0 -21
- hydroserverpy/etl/extractors/__init__.py +0 -0
- hydroserverpy/etl/extractors/local_file_extractor.py +0 -19
- hydroserverpy/etl/hydroserver_etl.py +0 -40
- hydroserverpy/etl/loaders/__init__.py +0 -0
- hydroserverpy/etl/loaders/hydroserver_loader.py +0 -71
- hydroserverpy/etl/transformers/__init__.py +0 -0
- hydroserverpy/etl/transformers/base.py +0 -64
- hydroserverpy/etl_csv/__init__.py +0 -0
- hydroserverpy/etl_csv/exceptions.py +0 -14
- hydroserverpy/etl_csv/hydroserver_etl_csv.py +0 -342
- /hydroserverpy/{etl → api/models/etl}/extractors/ftp_extractor.py +0 -0
- /hydroserverpy/{etl → api/models/etl}/types.py +0 -0
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/WHEEL +0 -0
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/licenses/LICENSE +0 -0
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/top_level.txt +0 -0
- {hydroserverpy-1.3.1.dist-info → hydroserverpy-1.4.0b4.dist-info}/zip-safe +0 -0
hydroserverpy/__init__.py
CHANGED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from .extractors import Extractor, HTTPExtractor, LocalFileExtractor, FTPExtractor
|
|
2
|
+
from .transformers import JSONTransformer, CSVTransformer, Transformer
|
|
3
|
+
from .loaders import HydroServerLoader, Loader
|
|
4
|
+
|
|
5
|
+
from .etl_configuration import EtlConfiguration
|
|
6
|
+
from .schedule import Schedule
|
|
7
|
+
from .status import Status
|
|
8
|
+
from .orchestration_system import OrchestrationSystem
|
|
9
|
+
from .data_source import DataSource
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"CSVTransformer",
|
|
13
|
+
"JSONTransformer",
|
|
14
|
+
"LocalFileExtractor",
|
|
15
|
+
"FTPExtractor",
|
|
16
|
+
"HTTPExtractor",
|
|
17
|
+
"Extractor",
|
|
18
|
+
"Transformer",
|
|
19
|
+
"Loader",
|
|
20
|
+
"HydroServerLoader",
|
|
21
|
+
"EtlConfiguration",
|
|
22
|
+
"Schedule",
|
|
23
|
+
"Status",
|
|
24
|
+
"OrchestrationSystem",
|
|
25
|
+
"DataSource",
|
|
26
|
+
]
|
|
@@ -1,111 +1,146 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from datetime import datetime, timedelta, timezone
|
|
3
|
+
from functools import cached_property
|
|
4
|
+
import logging
|
|
1
5
|
import uuid
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
|
|
6
|
+
from typing import ClassVar, TYPE_CHECKING, List, Optional, Union
|
|
7
|
+
import croniter
|
|
8
|
+
import pandas as pd
|
|
5
9
|
from pydantic import Field
|
|
6
|
-
|
|
7
|
-
from .orchestration_system import OrchestrationSystem
|
|
8
|
-
from .orchestration_configuration import OrchestrationConfigurationFields
|
|
9
|
-
from ..sta.datastream import Datastream
|
|
10
|
+
|
|
10
11
|
from ..base import HydroServerBaseModel
|
|
12
|
+
from ..sta.datastream import Datastream
|
|
13
|
+
from .orchestration_system import OrchestrationSystem
|
|
14
|
+
from .etl_configuration import EtlConfiguration
|
|
15
|
+
from .schedule import Schedule
|
|
16
|
+
from .status import Status
|
|
17
|
+
from .factories import extractor_factory, transformer_factory, loader_factory
|
|
18
|
+
from .loaders import HydroServerLoader
|
|
11
19
|
|
|
12
20
|
if TYPE_CHECKING:
|
|
13
21
|
from hydroserverpy import HydroServer
|
|
14
22
|
from hydroserverpy.api.models import Workspace
|
|
15
23
|
|
|
16
24
|
|
|
17
|
-
class DataSource(
|
|
18
|
-
HydroServerBaseModel, OrchestrationConfigurationFields
|
|
19
|
-
):
|
|
25
|
+
class DataSource(HydroServerBaseModel):
|
|
20
26
|
name: str = Field(..., max_length=255)
|
|
21
|
-
settings:
|
|
27
|
+
settings: EtlConfiguration
|
|
22
28
|
orchestration_system_id: uuid.UUID
|
|
29
|
+
schedule: Schedule
|
|
30
|
+
status: Status
|
|
23
31
|
workspace_id: uuid.UUID
|
|
24
32
|
|
|
25
33
|
_editable_fields: ClassVar[set[str]] = {
|
|
26
|
-
"name",
|
|
27
|
-
"
|
|
34
|
+
"name",
|
|
35
|
+
"settings",
|
|
36
|
+
"status",
|
|
37
|
+
"schedule",
|
|
38
|
+
"interval",
|
|
39
|
+
"interval_units",
|
|
40
|
+
"crontab",
|
|
41
|
+
"start_time",
|
|
42
|
+
"end_time",
|
|
43
|
+
"last_run_successful",
|
|
44
|
+
"last_run_message",
|
|
45
|
+
"last_run",
|
|
46
|
+
"next_run",
|
|
47
|
+
"paused",
|
|
28
48
|
}
|
|
29
49
|
|
|
30
|
-
def __init__(self, client:
|
|
50
|
+
def __init__(self, client: HydroServer, **data):
|
|
31
51
|
super().__init__(client=client, service=client.datasources, **data)
|
|
32
52
|
|
|
33
|
-
self._workspace = None
|
|
34
|
-
self._orchestration_system = None
|
|
35
|
-
self._datastreams = None
|
|
36
|
-
|
|
37
53
|
@classmethod
|
|
38
54
|
def get_route(cls):
|
|
39
55
|
return "data-sources"
|
|
40
56
|
|
|
41
|
-
@
|
|
42
|
-
def workspace(self) ->
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
if self._workspace is None:
|
|
46
|
-
self._workspace = self.client.workspaces.get(uid=self.workspace_id)
|
|
47
|
-
|
|
48
|
-
return self._workspace
|
|
49
|
-
|
|
50
|
-
@property
|
|
51
|
-
def orchestration_system(self) -> "OrchestrationSystem":
|
|
52
|
-
"""The orchestration system that manages this data source."""
|
|
57
|
+
@cached_property
|
|
58
|
+
def workspace(self) -> Workspace:
|
|
59
|
+
return self.client.workspaces.get(uid=self.workspace_id)
|
|
53
60
|
|
|
54
|
-
|
|
55
|
-
|
|
61
|
+
@cached_property
|
|
62
|
+
def orchestration_system(self) -> OrchestrationSystem:
|
|
63
|
+
return self.client.orchestrationsystems.get(uid=self.orchestration_system_id)
|
|
56
64
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def datastreams(self) -> List["Datastream"]:
|
|
61
|
-
"""The datastreams this data source provides data for."""
|
|
62
|
-
|
|
63
|
-
if self._datastreams is None:
|
|
64
|
-
self._datastreams = self.client.datastreams.list(data_source=self.uid, fetch_all=True).items
|
|
65
|
-
|
|
66
|
-
return self._datastreams
|
|
65
|
+
@cached_property
|
|
66
|
+
def datastreams(self) -> List[Datastream]:
|
|
67
|
+
return self.client.datastreams.list(data_source=self.uid, fetch_all=True).items
|
|
67
68
|
|
|
69
|
+
# TODO: Add functions like add_payload, add_mapping, etc. and don't allow the user to manually
|
|
70
|
+
# link or unlink datastreams - handle that automatically.
|
|
68
71
|
def add_datastream(self, datastream: Union["Datastream", uuid.UUID, str]):
|
|
69
72
|
"""Add a datastream to this data source."""
|
|
70
73
|
|
|
71
|
-
self.client.datasources.add_datastream(
|
|
72
|
-
uid=self.uid, datastream=datastream
|
|
73
|
-
)
|
|
74
|
+
self.client.datasources.add_datastream(uid=self.uid, datastream=datastream)
|
|
74
75
|
|
|
75
76
|
def remove_datastream(self, datastream: Union["Datastream", uuid.UUID, str]):
|
|
76
77
|
"""Remove a datastream from this data source."""
|
|
77
78
|
|
|
78
|
-
self.client.datasources.remove_datastream(
|
|
79
|
-
|
|
79
|
+
self.client.datasources.remove_datastream(uid=self.uid, datastream=datastream)
|
|
80
|
+
|
|
81
|
+
def _next_run(self) -> Optional[str]:
|
|
82
|
+
now = datetime.now(timezone.utc)
|
|
83
|
+
if cron := self.schedule.crontab:
|
|
84
|
+
return croniter.croniter(cron, now).get_next(datetime).isoformat()
|
|
85
|
+
if iv := self.schedule.interval:
|
|
86
|
+
unit = self.schedule.interval_units or "minutes"
|
|
87
|
+
return (now + timedelta(**{unit: iv})).isoformat()
|
|
88
|
+
return None
|
|
89
|
+
|
|
90
|
+
def _update_status(self, loader: HydroServerLoader, success: bool, msg: str):
|
|
91
|
+
short_msg = msg if len(msg) <= 255 else msg[:252] + "…"
|
|
92
|
+
loader.client.datasources.update(
|
|
93
|
+
uid=self.uid,
|
|
94
|
+
last_run=datetime.now(timezone.utc).isoformat(),
|
|
95
|
+
last_run_successful=success,
|
|
96
|
+
last_run_message=short_msg,
|
|
97
|
+
next_run=self._next_run(),
|
|
80
98
|
)
|
|
81
99
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
100
|
+
def is_empty(self, data):
|
|
101
|
+
if data is None:
|
|
102
|
+
return True
|
|
103
|
+
if isinstance(data, pd.DataFrame) and data.empty:
|
|
104
|
+
return True
|
|
105
|
+
return False
|
|
85
106
|
|
|
86
|
-
|
|
107
|
+
def load_data(self, payload_name: str = None):
|
|
108
|
+
"""Load data for this data source."""
|
|
109
|
+
if self.status.paused is True:
|
|
87
110
|
return
|
|
88
111
|
|
|
89
|
-
if
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
)
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
112
|
+
if payload_name:
|
|
113
|
+
self.load_data_for_payload(payload_name)
|
|
114
|
+
else:
|
|
115
|
+
for p in self.settings.payloads:
|
|
116
|
+
self.load_data_for_payload(p.name)
|
|
117
|
+
|
|
118
|
+
def load_data_for_payload(self, payload_name: str):
    """Run extract -> transform -> load for a single named payload.

    The outcome of the run (success flag and a short message) is written
    back to the data source through ``_update_status``.

    Args:
        payload_name: ``name`` of one of ``self.settings.payloads``.

    Raises:
        ValueError: if no configured payload has the given name.
    """
    # Use a default with next() so an unknown name produces a clear error
    # instead of a bare StopIteration leaking out of the generator.
    payload = next(
        (p for p in self.settings.payloads if p.name == payload_name), None
    )
    if payload is None:
        raise ValueError(
            f"No payload named '{payload_name}' is configured for this data source"
        )

    # The factories return ready-to-use instances, not classes.
    extractor = extractor_factory(self.settings.extractor)
    transformer = transformer_factory(self.settings.transformer)
    loader = loader_factory(self.settings.loader, self.client, self.uid)

    try:
        logging.info("Starting extract")
        data = extractor.extract(payload, loader)
        if self.is_empty(data):
            self._update_status(loader, True, "No data returned from the extractor")
            return

        logging.info("Starting transform")
        data = transformer.transform(data, payload.mappings)
        if self.is_empty(data):
            self._update_status(loader, True, "No data returned from the transformer")
            return

        logging.info("Starting load")
        loader.load(data, payload)
        self._update_status(loader, True, "OK")
    except Exception as e:
        # Deliberately broad: any failure in the pipeline is recorded on the
        # data source rather than propagated. Log the traceback so the cause
        # is not reduced to the truncated status message.
        logging.exception(f"ETL run failed for payload '{payload_name}'")
        self._update_status(loader, False, str(e))
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
from typing import Annotated, Dict, List, Literal, Optional, Union
|
|
2
|
+
from pydantic import BaseModel, Field, field_validator
|
|
3
|
+
from enum import Enum
|
|
4
|
+
|
|
5
|
+
WorkflowType = Literal["ETL", "Aggregation", "Virtual", "SDL"]
|
|
6
|
+
CSVDelimiterType = Literal[",", "|", "\t", ";", " "]
|
|
7
|
+
ExtractorType = Literal["HTTP", "local"]
|
|
8
|
+
TransformerType = Literal["JSON", "CSV"]
|
|
9
|
+
LoaderType = Literal["HydroServer"]
|
|
10
|
+
IdentifierType = Literal["name", "index"]
|
|
11
|
+
RunTimeValue = Literal["jobExecutionTime", "latestObservationTimestamp"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FixedOffsetTimezone(str, Enum):
|
|
15
|
+
UTC_MINUS_1200 = "-1200"
|
|
16
|
+
UTC_MINUS_1100 = "-1100"
|
|
17
|
+
UTC_MINUS_1000 = "-1000"
|
|
18
|
+
UTC_MINUS_0900 = "-0900"
|
|
19
|
+
UTC_MINUS_0800 = "-0800"
|
|
20
|
+
UTC_MINUS_0700 = "-0700"
|
|
21
|
+
UTC_MINUS_0600 = "-0600"
|
|
22
|
+
UTC_MINUS_0500 = "-0500"
|
|
23
|
+
UTC_MINUS_0430 = "-0430"
|
|
24
|
+
UTC_MINUS_0400 = "-0400"
|
|
25
|
+
UTC_MINUS_0330 = "-0330"
|
|
26
|
+
UTC_MINUS_0300 = "-0300"
|
|
27
|
+
UTC_MINUS_0200 = "-0200"
|
|
28
|
+
UTC_MINUS_0100 = "-0100"
|
|
29
|
+
UTC_PLUS_0000 = "+0000"
|
|
30
|
+
UTC_PLUS_0100 = "+0100"
|
|
31
|
+
UTC_PLUS_0200 = "+0200"
|
|
32
|
+
UTC_PLUS_0300 = "+0300"
|
|
33
|
+
UTC_PLUS_0330 = "+0330"
|
|
34
|
+
UTC_PLUS_0400 = "+0400"
|
|
35
|
+
UTC_PLUS_0430 = "+0430"
|
|
36
|
+
UTC_PLUS_0500 = "+0500"
|
|
37
|
+
UTC_PLUS_0530 = "+0530"
|
|
38
|
+
UTC_PLUS_0545 = "+0545"
|
|
39
|
+
UTC_PLUS_0600 = "+0600"
|
|
40
|
+
UTC_PLUS_0630 = "+0630"
|
|
41
|
+
UTC_PLUS_0700 = "+0700"
|
|
42
|
+
UTC_PLUS_0800 = "+0800"
|
|
43
|
+
UTC_PLUS_0845 = "+0845"
|
|
44
|
+
UTC_PLUS_0900 = "+0900"
|
|
45
|
+
UTC_PLUS_0930 = "+0930"
|
|
46
|
+
UTC_PLUS_1000 = "+1000"
|
|
47
|
+
UTC_PLUS_1030 = "+1030"
|
|
48
|
+
UTC_PLUS_1100 = "+1100"
|
|
49
|
+
UTC_PLUS_1130 = "+1130"
|
|
50
|
+
UTC_PLUS_1200 = "+1200"
|
|
51
|
+
UTC_PLUS_1245 = "+1245"
|
|
52
|
+
UTC_PLUS_1300 = "+1300"
|
|
53
|
+
UTC_PLUS_1400 = "+1400"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class TimestampFormat(str, Enum):
|
|
57
|
+
ISO8601 = "ISO8601"
|
|
58
|
+
naive = "naive"
|
|
59
|
+
custom = "custom"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class TimezoneMode(str, Enum):
|
|
63
|
+
utc = "utc" # always UTC
|
|
64
|
+
daylightSavings = "daylightSavings" # IANA / DST-aware
|
|
65
|
+
fixedOffset = "fixedOffset" # constant offset
|
|
66
|
+
embeddedOffset = "embeddedOffset" # offset in ISO string
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class Timestamp(BaseModel):
    """Describe how a timestamp is located, formatted and localized.

    Fields can be supplied by their camelCase alias (``customFormat``,
    ``timezoneMode``) or by their Python attribute name.
    """

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    model_config = {"populate_by_name": True}

    key: Optional[str] = None
    format: TimestampFormat
    custom_format: Optional[str] = Field(None, alias="customFormat")
    timezone_mode: TimezoneMode = Field(..., alias="timezoneMode")
    # validate_default=True makes ``check_timezone`` run when ``timezone`` is
    # omitted; pydantic skips field validators for default values otherwise,
    # so the fixedOffset requirement would never be enforced in that case.
    timezone: Optional[Union[FixedOffsetTimezone, str]] = Field(
        None, alias="timezone", validate_default=True
    )

    @field_validator("timezone")
    @classmethod
    def check_timezone(cls, timezone_value, info):
        """Require ``timezone`` whenever ``timezone_mode`` is ``fixedOffset``.

        Raises:
            ValueError: if the mode is ``fixedOffset`` and no timezone is given.
        """
        # ``timezone_mode`` is declared before ``timezone``, so it is already
        # in ``info.data`` (absent only if its own validation failed).
        mode = info.data.get("timezone_mode")
        if mode == TimezoneMode.fixedOffset and timezone_value is None:
            raise ValueError("`timezone` must be set when timezoneMode is fixedOffset")
        return timezone_value
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class PerPayloadPlaceholder(BaseModel):
|
|
88
|
+
name: str
|
|
89
|
+
type: Literal["perPayload"]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class RunTimePlaceholder(BaseModel):
    """Placeholder variable tagged ``type == "runTime"``.

    ``run_time_value`` names which run-time quantity fills the placeholder and
    ``timestamp`` carries the formatting settings for that value.
    """

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    model_config = {"populate_by_name": True}

    name: str
    type: Literal["runTime"]
    run_time_value: RunTimeValue = Field(..., alias="runTimeValue")
    timestamp: Timestamp
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
PlaceholderVariable = Annotated[
|
|
103
|
+
Union[PerPayloadPlaceholder, RunTimePlaceholder],
|
|
104
|
+
Field(discriminator="type"),
|
|
105
|
+
]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class BaseExtractor(BaseModel):
    """Settings common to every extractor configuration.

    ``source_uri`` is the location to read from; it may contain ``str.format``
    style placeholders that are filled from ``placeholder_variables``.
    """

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    # Subclasses inherit this setting.
    model_config = {"populate_by_name": True}

    type: ExtractorType
    source_uri: str = Field(..., alias="sourceUri")
    # Defaults to an empty list when omitted; an explicit null is still accepted.
    placeholder_variables: Optional[List[PlaceholderVariable]] = Field(
        default_factory=list,
        alias="placeholderVariables",
    )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class HTTPExtractor(BaseExtractor):
|
|
121
|
+
type: Literal["HTTP"]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class LocalFileExtractor(BaseExtractor):
|
|
125
|
+
type: Literal["local"]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
ExtractorConfig = Annotated[
|
|
129
|
+
Union[HTTPExtractor, LocalFileExtractor], Field(discriminator="type")
|
|
130
|
+
]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class BaseTransformer(BaseModel):
|
|
134
|
+
type: TransformerType
|
|
135
|
+
timestamp: Timestamp
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class JSONTransformer(BaseTransformer):
    """Transformer settings tagged ``type == "JSON"``.

    ``jmespath`` holds the query string and is serialized under the alias
    ``JMESPath``.
    """

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    model_config = {"populate_by_name": True}

    type: Literal["JSON"]
    jmespath: str = Field(..., alias="JMESPath")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class CSVTransformer(BaseTransformer):
    """Transformer settings tagged ``type == "CSV"``.

    All fields are required; ``header_row`` must be present but may be null.
    """

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    model_config = {"populate_by_name": True}

    type: Literal["CSV"]
    header_row: Optional[int] = Field(..., alias="headerRow")
    data_start_row: int = Field(..., alias="dataStartRow")
    delimiter: CSVDelimiterType
    identifier_type: IdentifierType = Field(..., alias="identifierType")
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
TransformerConfig = Union[JSONTransformer, CSVTransformer]
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class BaseLoaderConfig(BaseModel):
|
|
161
|
+
type: LoaderType
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class HydroServerLoaderConfig(BaseLoaderConfig):
|
|
165
|
+
type: Literal["HydroServer"]
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
LoaderConfig = HydroServerLoaderConfig
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class ExpressionDataTransformation(BaseModel):
    """Data transformation tagged ``type == "expression"`` carrying an expression string."""

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    model_config = {"populate_by_name": True}

    type: Literal["expression"]
    expression: str
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class LookupTableDataTransformation(BaseModel):
    """Data transformation tagged ``type == "lookup"`` referencing a lookup table by id."""

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    model_config = {"populate_by_name": True}

    type: Literal["lookup"]
    lookup_table_id: str = Field(..., alias="lookupTableId")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
DataTransformation = Union[ExpressionDataTransformation, LookupTableDataTransformation]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class MappingPath(BaseModel):
    """One target of a source mapping plus the transformations applied on the way.

    ``data_transformations`` defaults to an empty list when omitted.
    """

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    model_config = {"populate_by_name": True}

    target_identifier: Union[str, int] = Field(..., alias="targetIdentifier")
    data_transformations: List[DataTransformation] = Field(
        default_factory=list, alias="dataTransformations"
    )
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class SourceTargetMapping(BaseModel):
    """Link one source identifier to zero or more ``MappingPath`` targets."""

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    model_config = {"populate_by_name": True}

    source_identifier: Union[str, int] = Field(..., alias="sourceIdentifier")
    paths: List[MappingPath] = Field(default_factory=list)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class Payload(BaseModel):
    """A named group of mappings processed together in one ETL run.

    ``extractor_variables`` supplies the values for ``perPayload`` placeholder
    variables, keyed by placeholder name.
    """

    # pydantic v2 renamed ``allow_population_by_field_name`` to
    # ``populate_by_name``; the old key only emits a warning and is ignored.
    model_config = {"populate_by_name": True}

    name: str = ""
    mappings: List[SourceTargetMapping] = Field(default_factory=list)
    extractor_variables: Dict[str, str] = Field(
        default_factory=dict, alias="extractorVariables"
    )
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
class EtlConfiguration(BaseModel):
|
|
220
|
+
type: WorkflowType
|
|
221
|
+
extractor: ExtractorConfig
|
|
222
|
+
transformer: TransformerConfig
|
|
223
|
+
loader: LoaderConfig
|
|
224
|
+
payloads: List[Payload]
|
|
@@ -2,49 +2,46 @@ from abc import abstractmethod
|
|
|
2
2
|
import logging
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from datetime import datetime
|
|
5
|
-
|
|
6
|
-
from
|
|
5
|
+
from ..etl_configuration import ExtractorConfig, Payload
|
|
6
|
+
from ..timestamp_parser import TimestampParser
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class Extractor:
|
|
10
|
-
def __init__(self,
|
|
11
|
-
self.
|
|
12
|
-
self.source_uri = settings["sourceUri"]
|
|
10
|
+
def __init__(self, extractor_config: ExtractorConfig):
|
|
11
|
+
self.cfg = extractor_config
|
|
13
12
|
|
|
14
|
-
def resolve_placeholder_variables(self, payload, loader):
|
|
13
|
+
def resolve_placeholder_variables(self, payload: Payload, loader):
|
|
15
14
|
logging.info(f"Creating runtime variables...")
|
|
16
15
|
filled = {}
|
|
17
|
-
for
|
|
18
|
-
name =
|
|
19
|
-
var_type = var.get("type", None)
|
|
16
|
+
for placeholder in self.cfg.placeholder_variables:
|
|
17
|
+
name = placeholder.name
|
|
20
18
|
|
|
21
|
-
if
|
|
19
|
+
if placeholder.type == "runTime":
|
|
22
20
|
logging.info(f"Resolving runtime var: {name}")
|
|
23
|
-
if
|
|
21
|
+
if placeholder.run_time_value == "latestObservationTimestamp":
|
|
24
22
|
value = loader.earliest_begin_date(payload)
|
|
25
|
-
elif
|
|
23
|
+
elif placeholder.run_time_value == "jobExecutionTime":
|
|
26
24
|
value = pd.Timestamp.now(tz="UTC")
|
|
27
|
-
elif
|
|
25
|
+
elif placeholder.type == "perPayload":
|
|
28
26
|
logging.info(f"Resolving payload var: {name}")
|
|
29
|
-
|
|
30
|
-
if name not in payload_vars:
|
|
27
|
+
if name not in payload.extractor_variables:
|
|
31
28
|
raise KeyError(f"Missing per-payload variable '{name}'")
|
|
32
|
-
value =
|
|
29
|
+
value = payload.extractor_variables[name]
|
|
33
30
|
else:
|
|
34
31
|
continue
|
|
35
32
|
|
|
36
33
|
if isinstance(value, (datetime, pd.Timestamp)):
|
|
37
|
-
parser = TimestampParser(
|
|
34
|
+
parser = TimestampParser(placeholder.timestamp)
|
|
38
35
|
value = parser.utc_to_string(value)
|
|
39
36
|
|
|
40
37
|
filled[name] = value
|
|
41
38
|
if not filled:
|
|
42
|
-
return self.source_uri
|
|
39
|
+
return self.cfg.source_uri
|
|
43
40
|
return self.format_uri(filled)
|
|
44
41
|
|
|
45
42
|
def format_uri(self, placeholder_variables):
|
|
46
43
|
try:
|
|
47
|
-
uri = self.source_uri.format(**placeholder_variables)
|
|
44
|
+
uri = self.cfg.source_uri.format(**placeholder_variables)
|
|
48
45
|
except KeyError as e:
|
|
49
46
|
missing_key = e.args[0]
|
|
50
47
|
raise KeyError(f"Missing placeholder variable: {missing_key}")
|
|
@@ -1,25 +1,24 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import requests
|
|
3
3
|
from io import BytesIO
|
|
4
|
-
|
|
4
|
+
|
|
5
|
+
from ..etl_configuration import Payload
|
|
6
|
+
from .base import Extractor, ExtractorConfig
|
|
5
7
|
|
|
6
8
|
|
|
7
9
|
class HTTPExtractor(Extractor):
|
|
8
|
-
def __init__(self, settings:
|
|
10
|
+
def __init__(self, settings: ExtractorConfig):
|
|
9
11
|
super().__init__(settings)
|
|
10
12
|
|
|
11
|
-
def extract(self, payload, loader=None):
|
|
13
|
+
def extract(self, payload: Payload, loader=None):
|
|
12
14
|
"""
|
|
13
15
|
Downloads the file from the HTTP/HTTPS server and returns a file-like object.
|
|
14
16
|
"""
|
|
15
17
|
url = self.resolve_placeholder_variables(payload, loader)
|
|
16
18
|
logging.info(f"Requesting data from → {url}")
|
|
17
19
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
except Exception as e:
|
|
21
|
-
logging.error(f"Failed to fetch {url}: {e}")
|
|
22
|
-
raise
|
|
20
|
+
response = requests.get(url)
|
|
21
|
+
response.raise_for_status()
|
|
23
22
|
|
|
24
23
|
data = BytesIO()
|
|
25
24
|
for chunk in response.iter_content(chunk_size=8192):
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from .base import Extractor
|
|
3
|
+
from ..etl_configuration import ExtractorConfig
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class LocalFileExtractor(Extractor):
    """Extractor that reads its data from a file on the local filesystem."""

    def __init__(self, extractor_config: ExtractorConfig):
        super().__init__(extractor_config)

    def extract(self, payload=None, loader=None):
        """
        Opens the file and returns a file-like object.

        ``payload`` and ``loader`` are accepted so this extractor can be
        invoked exactly like ``HTTPExtractor.extract(payload, loader)``; both
        are currently ignored. Without them a positional call such as
        ``extract(payload, loader)`` raises ``TypeError``.

        Returns:
            The open text-mode file handle for ``self.cfg.source_uri``, or
            ``None`` if the file could not be opened (the error is logged).
        """
        try:
            file_handle = open(self.cfg.source_uri, "r")
            logging.info(f"Successfully opened file '{self.cfg.source_uri}'.")
            return file_handle
        except Exception as e:
            # NOTE(review): returning None hides the failure from callers that
            # only check for empty data — confirm whether this should re-raise.
            logging.error(f"Error opening file '{self.cfg.source_uri}': {e}")
            return None
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from .extractors import HTTPExtractor, LocalFileExtractor
|
|
2
|
+
from .transformers import JSONTransformer, CSVTransformer
|
|
3
|
+
from .loaders import HydroServerLoader
|
|
4
|
+
from .etl_configuration import ExtractorConfig, TransformerConfig, LoaderConfig
|
|
5
|
+
|
|
6
|
+
EXTRACTORS = {"HTTP": HTTPExtractor, "local": LocalFileExtractor}
|
|
7
|
+
TRANSFORMERS = {"JSON": JSONTransformer, "CSV": CSVTransformer}
|
|
8
|
+
LOADERS = {"HydroServer": HydroServerLoader}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def extractor_factory(settings: ExtractorConfig):
|
|
12
|
+
cls = EXTRACTORS[settings.type]
|
|
13
|
+
return cls(settings)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def transformer_factory(settings: TransformerConfig):
|
|
17
|
+
cls = TRANSFORMERS[settings.type]
|
|
18
|
+
return cls(settings)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def loader_factory(settings: LoaderConfig, auth_context, data_source_id: str):
|
|
22
|
+
cls = LOADERS[settings.type]
|
|
23
|
+
return cls(auth_context, data_source_id)
|