hydroserverpy 1.3.0b3__tar.gz → 1.4.0b3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hydroserverpy-1.3.0b3/src/hydroserverpy.egg-info → hydroserverpy-1.4.0b3}/PKG-INFO +1 -1
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/README.md +2 -1
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/setup.cfg +1 -1
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/__init__.py +0 -2
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/__init__.py +26 -0
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/data_source.py +146 -0
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/etl_configuration.py +224 -0
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/extractors/__init__.py +6 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/extractors/base.py +16 -19
- {hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/extractors/http_extractor.py +5 -3
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/extractors/local_file_extractor.py +20 -0
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/factories.py +23 -0
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/loaders/__init__.py +4 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/loaders/base.py +0 -2
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/loaders/hydroserver_loader.py +100 -0
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/schedule.py +16 -0
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/status.py +14 -0
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/transformers/__init__.py +5 -0
- hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/transformers/base.py +128 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/transformers/csv_transformer.py +24 -13
- {hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/transformers/json_transformer.py +7 -6
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/etl/data_source.py +1 -4
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3/src/hydroserverpy.egg-info}/PKG-INFO +1 -1
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy.egg-info/SOURCES.txt +18 -19
- hydroserverpy-1.3.0b3/src/hydroserverpy/api/models/etl/data_source.py +0 -111
- hydroserverpy-1.3.0b3/src/hydroserverpy/api/services/sta/__init__.py +0 -0
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl/__init__.py +0 -21
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl/extractors/__init__.py +0 -0
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl/extractors/local_file_extractor.py +0 -19
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl/hydroserver_etl.py +0 -40
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl/loaders/__init__.py +0 -0
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl/loaders/hydroserver_loader.py +0 -71
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl/transformers/__init__.py +0 -0
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl/transformers/base.py +0 -64
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl_csv/__init__.py +0 -0
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl_csv/exceptions.py +0 -14
- hydroserverpy-1.3.0b3/src/hydroserverpy/etl_csv/hydroserver_etl_csv.py +0 -346
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/LICENSE +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/pyproject.toml +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/setup.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/__init__.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/client.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/__init__.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/base.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/etl/data_archive.py +0 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/extractors/ftp_extractor.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/etl/orchestration_configuration.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/etl/orchestration_system.py +0 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/timestamp_parser.py +0 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/types.py +0 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy/api/models/etl → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/iam}/__init__.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/iam/account.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/iam/apikey.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/iam/collaborator.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/iam/role.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/iam/workspace.py +0 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy/api/models/iam → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/sta}/__init__.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/sta/datastream.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/sta/observation.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/sta/observed_property.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/sta/processing_level.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/sta/result_qualifier.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/sta/sensor.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/sta/thing.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/models/sta/unit.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/__init__.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/base.py +0 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy/api/models/sta → hydroserverpy-1.4.0b3/src/hydroserverpy/api/services/etl}/__init__.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/etl/data_archive.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/etl/orchestration_system.py +0 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy/api/services/etl → hydroserverpy-1.4.0b3/src/hydroserverpy/api/services/iam}/__init__.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/iam/role.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/iam/workspace.py +0 -0
- {hydroserverpy-1.3.0b3/src/hydroserverpy/api/services/iam → hydroserverpy-1.4.0b3/src/hydroserverpy/api/services/sta}/__init__.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/sta/datastream.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/sta/observed_property.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/sta/processing_level.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/sta/result_qualifier.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/sta/sensor.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/sta/thing.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/services/sta/unit.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/api/utils.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/quality/__init__.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy/quality/service.py +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy.egg-info/dependency_links.txt +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy.egg-info/requires.txt +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy.egg-info/top_level.txt +0 -0
- {hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/src/hydroserverpy.egg-info/zip-safe +0 -0

{hydroserverpy-1.3.0b3 → hydroserverpy-1.4.0b3}/README.md

@@ -40,4 +40,5 @@ hs_api = HydroServer(

 ## Funding and Acknowledgements

-Funding for this project was provided by the National Oceanic & Atmospheric Administration (NOAA), awarded to the Cooperative Institute for Research to Operations in Hydrology (CIROH) through the NOAA Cooperative Agreement with The University of Alabama (NA22NWS4320003).
+Funding for this project was provided by the National Oceanic & Atmospheric Administration (NOAA), awarded to the Cooperative Institute for Research to Operations in Hydrology (CIROH) through the NOAA Cooperative Agreement with The University of Alabama (NA22NWS4320003). Utah State University is a founding member of CIROH and receives funding under subaward from the University of Alabama. Additional funding and support have been provided by the State of Utah Division of Water Rights, the World Meorological Organization, and the Utah Water Research laboratory at Utah State University.
+

hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/__init__.py (new file)

@@ -0,0 +1,26 @@
+from .extractors import Extractor, HTTPExtractor, LocalFileExtractor, FTPExtractor
+from .transformers import JSONTransformer, CSVTransformer, Transformer
+from .loaders import HydroServerLoader, Loader
+
+from .etl_configuration import EtlConfiguration
+from .schedule import Schedule
+from .status import Status
+from .orchestration_system import OrchestrationSystem
+from .data_source import DataSource
+
+__all__ = [
+    "CSVTransformer",
+    "JSONTransformer",
+    "LocalFileExtractor",
+    "FTPExtractor",
+    "HTTPExtractor",
+    "Extractor",
+    "Transformer",
+    "Loader",
+    "HydroServerLoader",
+    "EtlConfiguration",
+    "Schedule",
+    "Status",
+    "OrchestrationSystem",
+    "DataSource",
+]
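
Since the standalone hydroserverpy.etl and hydroserverpy.etl_csv packages are removed in this release (see the deleted files above), the ETL classes are now exposed from hydroserverpy.api.models.etl. A minimal import sketch, with the path inferred from the 1.4.0b3 package layout shown in this diff:

    # Sketch only: import path inferred from the new package layout, not from released docs.
    from hydroserverpy.api.models.etl import (
        CSVTransformer,
        EtlConfiguration,
        HTTPExtractor,
        HydroServerLoader,
    )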

hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/data_source.py (new file)

@@ -0,0 +1,146 @@
+from __future__ import annotations
+from datetime import datetime, timedelta, timezone
+from functools import cached_property
+import logging
+import uuid
+from typing import ClassVar, TYPE_CHECKING, List, Optional, Union
+import croniter
+import pandas as pd
+from pydantic import Field
+
+from ..base import HydroServerBaseModel
+from ..sta.datastream import Datastream
+from .orchestration_system import OrchestrationSystem
+from .etl_configuration import EtlConfiguration
+from .schedule import Schedule
+from .status import Status
+from .factories import extractor_factory, transformer_factory, loader_factory
+from .loaders import HydroServerLoader
+
+if TYPE_CHECKING:
+    from hydroserverpy import HydroServer
+    from hydroserverpy.api.models import Workspace
+
+
+class DataSource(HydroServerBaseModel):
+    name: str = Field(..., max_length=255)
+    settings: EtlConfiguration
+    orchestration_system_id: uuid.UUID
+    schedule: Schedule
+    status: Status
+    workspace_id: uuid.UUID
+
+    _editable_fields: ClassVar[set[str]] = {
+        "name",
+        "settings",
+        "status",
+        "schedule",
+        "interval",
+        "interval_units",
+        "crontab",
+        "start_time",
+        "end_time",
+        "last_run_successful",
+        "last_run_message",
+        "last_run",
+        "next_run",
+        "paused",
+    }
+
+    def __init__(self, client: HydroServer, **data):
+        super().__init__(client=client, service=client.datasources, **data)
+
+    @classmethod
+    def get_route(cls):
+        return "data-sources"
+
+    @cached_property
+    def workspace(self) -> Workspace:
+        return self.client.workspaces.get(uid=self.workspace_id)
+
+    @cached_property
+    def orchestration_system(self) -> OrchestrationSystem:
+        return self.client.orchestrationsystems.get(uid=self.orchestration_system_id)
+
+    @cached_property
+    def datastreams(self) -> List[Datastream]:
+        return self.client.datastreams.list(data_source=self.uid, fetch_all=True).items
+
+    # TODO: Add functions like add_payload, add_mapping, etc. and don't allow the user to manually
+    # link or unlink datastreams - handle that automatically.
+    def add_datastream(self, datastream: Union["Datastream", uuid.UUID, str]):
+        """Add a datastream to this data source."""
+
+        self.client.datasources.add_datastream(uid=self.uid, datastream=datastream)
+
+    def remove_datastream(self, datastream: Union["Datastream", uuid.UUID, str]):
+        """Remove a datastream from this data source."""
+
+        self.client.datasources.remove_datastream(uid=self.uid, datastream=datastream)
+
+    def _next_run(self) -> Optional[str]:
+        now = datetime.now(timezone.utc)
+        if cron := self.schedule.crontab:
+            return croniter.croniter(cron, now).get_next(datetime).isoformat()
+        if iv := self.schedule.interval:
+            unit = self.schedule.interval_units or "minutes"
+            return (now + timedelta(**{unit: iv})).isoformat()
+        return None
+
+    def _update_status(self, loader: HydroServerLoader, success: bool, msg: str):
+        short_msg = msg if len(msg) <= 255 else msg[:252] + "…"
+        loader.client.datasources.update(
+            uid=self.uid,
+            last_run=datetime.now(timezone.utc).isoformat(),
+            last_run_successful=success,
+            last_run_message=short_msg,
+            next_run=self._next_run(),
+        )
+
+    def is_empty(self, data):
+        if data is None:
+            return True
+        if isinstance(data, pd.DataFrame) and data.empty:
+            return True
+        return False
+
+    def load_data(self, payload_name: str = None):
+        """Load data for this data source."""
+        if self.status.paused is True:
+            return
+
+        if payload_name:
+            self.load_data_for_payload(payload_name)
+        else:
+            for p in self.settings.payloads:
+                self.load_data_for_payload(p.name)
+
+    def load_data_for_payload(self, payload_name: str):
+        payload = next(p for p in self.settings.payloads if p.name == payload_name)
+
+        extractor_cls = extractor_factory(self.settings.extractor)
+        transformer_cls = transformer_factory(self.settings.transformer)
+        loader_cls = loader_factory(self.settings.loader, self.client, self.uid)
+
+        try:
+            logging.info("Starting extract")
+            data = extractor_cls.extract(payload, loader_cls)
+            if self.is_empty(data):
+                self._update_status(
+                    loader_cls, True, "No data returned from the extractor"
+                )
+                return
+
+            logging.info("Starting transform")
+            data = transformer_cls.transform(data, payload.mappings)
+            if self.is_empty(data):
+                self._update_status(
+                    loader_cls, True, "No data returned from the transformer"
+                )
+                return
+
+            logging.info("Starting load")
+            loader_cls.load(data, payload)
+            self._update_status(loader_cls, True, "OK")
+        except Exception as e:
+            self._update_status(loader_cls, False, str(e))
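
Together with the Payload model defined in etl_configuration.py below, DataSource.load_data runs extract → transform → load for each configured payload and records the outcome via _update_status. A minimal usage sketch, assuming an authenticated client; the host, UID, and HydroServer(...) arguments are placeholders not shown in this diff:

    # Sketch only: connection details and the UID are illustrative placeholders.
    from hydroserverpy import HydroServer

    hs_api = HydroServer("https://hydroserver.example.com")  # assumed constructor arguments
    data_source = hs_api.datasources.get(uid="00000000-0000-0000-0000-000000000000")

    data_source.load_data()                 # run every payload in settings.payloads
    data_source.load_data("my-payload")     # or just one named payload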

hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/etl_configuration.py (new file)

@@ -0,0 +1,224 @@
+from typing import Annotated, Dict, List, Literal, Optional, Union
+from pydantic import BaseModel, Field, field_validator
+from enum import Enum
+
+WorkflowType = Literal["ETL", "Aggregation", "Virtual", "SDL"]
+CSVDelimiterType = Literal[",", "|", "\t", ";", " "]
+ExtractorType = Literal["HTTP", "local"]
+TransformerType = Literal["JSON", "CSV"]
+LoaderType = Literal["HydroServer"]
+IdentifierType = Literal["name", "index"]
+RunTimeValue = Literal["jobExecutionTime", "latestObservationTimestamp"]
+
+
+class FixedOffsetTimezone(str, Enum):
+    UTC_MINUS_1200 = "-1200"
+    UTC_MINUS_1100 = "-1100"
+    UTC_MINUS_1000 = "-1000"
+    UTC_MINUS_0900 = "-0900"
+    UTC_MINUS_0800 = "-0800"
+    UTC_MINUS_0700 = "-0700"
+    UTC_MINUS_0600 = "-0600"
+    UTC_MINUS_0500 = "-0500"
+    UTC_MINUS_0430 = "-0430"
+    UTC_MINUS_0400 = "-0400"
+    UTC_MINUS_0330 = "-0330"
+    UTC_MINUS_0300 = "-0300"
+    UTC_MINUS_0200 = "-0200"
+    UTC_MINUS_0100 = "-0100"
+    UTC_PLUS_0000 = "+0000"
+    UTC_PLUS_0100 = "+0100"
+    UTC_PLUS_0200 = "+0200"
+    UTC_PLUS_0300 = "+0300"
+    UTC_PLUS_0330 = "+0330"
+    UTC_PLUS_0400 = "+0400"
+    UTC_PLUS_0430 = "+0430"
+    UTC_PLUS_0500 = "+0500"
+    UTC_PLUS_0530 = "+0530"
+    UTC_PLUS_0545 = "+0545"
+    UTC_PLUS_0600 = "+0600"
+    UTC_PLUS_0630 = "+0630"
+    UTC_PLUS_0700 = "+0700"
+    UTC_PLUS_0800 = "+0800"
+    UTC_PLUS_0845 = "+0845"
+    UTC_PLUS_0900 = "+0900"
+    UTC_PLUS_0930 = "+0930"
+    UTC_PLUS_1000 = "+1000"
+    UTC_PLUS_1030 = "+1030"
+    UTC_PLUS_1100 = "+1100"
+    UTC_PLUS_1130 = "+1130"
+    UTC_PLUS_1200 = "+1200"
+    UTC_PLUS_1245 = "+1245"
+    UTC_PLUS_1300 = "+1300"
+    UTC_PLUS_1400 = "+1400"
+
+
+class TimestampFormat(str, Enum):
+    ISO8601 = "ISO8601"
+    naive = "naive"
+    custom = "custom"
+
+
+class TimezoneMode(str, Enum):
+    utc = "utc"  # always UTC
+    daylightSavings = "daylightSavings"  # IANA / DST-aware
+    fixedOffset = "fixedOffset"  # constant offset
+    embeddedOffset = "embeddedOffset"  # offset in ISO string
+
+
+class Timestamp(BaseModel):
+    key: Optional[str] = None
+    format: TimestampFormat
+    custom_format: Optional[str] = Field(None, alias="customFormat")
+    timezone_mode: TimezoneMode = Field(..., alias="timezoneMode")
+    timezone: Optional[Union[FixedOffsetTimezone, str]] = Field(None, alias="timezone")
+
+    class Config:
+        allow_population_by_field_name = True
+
+    @field_validator("timezone")
+    def check_timezone(cls, timezone_value, info):
+        mode = info.data.get("timezone_mode")
+        if mode == TimezoneMode.fixedOffset and timezone_value is None:
+            raise ValueError("`timezone` must be set when timezoneMode is fixedOffset")
+        return timezone_value
+
+
+class PerPayloadPlaceholder(BaseModel):
+    name: str
+    type: Literal["perPayload"]
+
+
+class RunTimePlaceholder(BaseModel):
+    name: str
+    type: Literal["runTime"]
+    run_time_value: RunTimeValue = Field(..., alias="runTimeValue")
+    timestamp: Timestamp
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+PlaceholderVariable = Annotated[
+    Union[PerPayloadPlaceholder, RunTimePlaceholder],
+    Field(discriminator="type"),
+]
+
+
+class BaseExtractor(BaseModel):
+    type: ExtractorType
+    source_uri: str = Field(..., alias="sourceUri")
+    placeholder_variables: Optional[List[PlaceholderVariable]] = Field(
+        default_factory=list,
+        alias="placeholderVariables",
+    )
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class HTTPExtractor(BaseExtractor):
+    type: Literal["HTTP"]
+
+
+class LocalFileExtractor(BaseExtractor):
+    type: Literal["local"]
+
+
+ExtractorConfig = Annotated[
+    Union[HTTPExtractor, LocalFileExtractor], Field(discriminator="type")
+]
+
+
+class BaseTransformer(BaseModel):
+    type: TransformerType
+    timestamp: Timestamp
+
+
+class JSONTransformer(BaseTransformer):
+    type: Literal["JSON"]
+    jmespath: str = Field(..., alias="JMESPath")
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class CSVTransformer(BaseTransformer):
+    type: Literal["CSV"]
+    header_row: Optional[int] = Field(..., alias="headerRow")
+    data_start_row: int = Field(..., alias="dataStartRow")
+    delimiter: CSVDelimiterType
+    identifier_type: IdentifierType = Field(..., alias="identifierType")
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+TransformerConfig = Union[JSONTransformer, CSVTransformer]
+
+
+class BaseLoaderConfig(BaseModel):
+    type: LoaderType
+
+
+class HydroServerLoaderConfig(BaseLoaderConfig):
+    type: Literal["HydroServer"]
+
+
+LoaderConfig = HydroServerLoaderConfig
+
+
+class ExpressionDataTransformation(BaseModel):
+    type: Literal["expression"]
+    expression: str
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class LookupTableDataTransformation(BaseModel):
+    type: Literal["lookup"]
+    lookup_table_id: str = Field(..., alias="lookupTableId")
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+DataTransformation = Union[ExpressionDataTransformation, LookupTableDataTransformation]
+
+
+class MappingPath(BaseModel):
+    target_identifier: Union[str, int] = Field(..., alias="targetIdentifier")
+    data_transformations: List[DataTransformation] = Field(
+        default_factory=list, alias="dataTransformations"
+    )
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class SourceTargetMapping(BaseModel):
+    source_identifier: Union[str, int] = Field(..., alias="sourceIdentifier")
+    paths: List[MappingPath] = Field(default_factory=list)
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class Payload(BaseModel):
+    name: str = ""
+    mappings: List[SourceTargetMapping] = Field(default_factory=list)
+    extractor_variables: Dict[str, str] = Field(
+        default_factory=dict, alias="extractorVariables"
+    )
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class EtlConfiguration(BaseModel):
+    type: WorkflowType
+    extractor: ExtractorConfig
+    transformer: TransformerConfig
+    loader: LoaderConfig
+    payloads: List[Payload]
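
For reference, a minimal sketch of building one of these configurations from plain dictionaries. The keys follow the camelCase aliases declared above, but the concrete values (URL, placeholder name, delimiter, payload name) are illustrative assumptions, not values taken from this release:

    # Sketch only: values are illustrative; field aliases come from the models above.
    config = EtlConfiguration(
        type="ETL",
        extractor={
            "type": "HTTP",
            "sourceUri": "https://example.com/observations?since={start}",
            "placeholderVariables": [
                {
                    "name": "start",
                    "type": "runTime",
                    "runTimeValue": "latestObservationTimestamp",
                    "timestamp": {"format": "ISO8601", "timezoneMode": "utc"},
                }
            ],
        },
        transformer={
            "type": "CSV",
            "headerRow": 1,
            "dataStartRow": 2,
            "delimiter": ",",
            "identifierType": "name",
            "timestamp": {"key": "timestamp", "format": "ISO8601", "timezoneMode": "utc"},
        },
        loader={"type": "HydroServer"},
        payloads=[{"name": "site-a"}],
    )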

{hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/extractors/base.py
(removed-line contents below are shown as truncated in the source rendering)

@@ -2,49 +2,46 @@ from abc import abstractmethod
 import logging
 import pandas as pd
 from datetime import datetime
-
-from
+from ..etl_configuration import ExtractorConfig, Payload
+from ..timestamp_parser import TimestampParser


 class Extractor:
-    def __init__(self,
-        self.
-        self.source_uri = settings["sourceUri"]
+    def __init__(self, extractor_config: ExtractorConfig):
+        self.cfg = extractor_config

-    def resolve_placeholder_variables(self, payload, loader):
+    def resolve_placeholder_variables(self, payload: Payload, loader):
         logging.info(f"Creating runtime variables...")
         filled = {}
-        for
-            name =
-            var_type = var.get("type", None)
+        for placeholder in self.cfg.placeholder_variables:
+            name = placeholder.name

-            if
+            if placeholder.type == "runTime":
                 logging.info(f"Resolving runtime var: {name}")
-                if
+                if placeholder.run_time_value == "latestObservationTimestamp":
                     value = loader.earliest_begin_date(payload)
-                elif
+                elif placeholder.run_time_value == "jobExecutionTime":
                     value = pd.Timestamp.now(tz="UTC")
-            elif
+            elif placeholder.type == "perPayload":
                 logging.info(f"Resolving payload var: {name}")
-
-                if name not in payload_vars:
+                if name not in payload.extractor_variables:
                     raise KeyError(f"Missing per-payload variable '{name}'")
-                    value =
+                value = payload.extractor_variables[name]
             else:
                 continue

             if isinstance(value, (datetime, pd.Timestamp)):
-                parser = TimestampParser(
+                parser = TimestampParser(placeholder.timestamp)
                 value = parser.utc_to_string(value)

             filled[name] = value
         if not filled:
-            return self.source_uri
+            return self.cfg.source_uri
         return self.format_uri(filled)

     def format_uri(self, placeholder_variables):
         try:
-            uri = self.source_uri.format(**placeholder_variables)
+            uri = self.cfg.source_uri.format(**placeholder_variables)
         except KeyError as e:
             missing_key = e.args[0]
             raise KeyError(f"Missing placeholder variable: {missing_key}")
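
The sourceUri templating in format_uri above is plain Python str.format substitution over the resolved placeholder values. A tiny illustrative example (the URL and variable names are made up):

    # Illustrative only: resolved placeholder values are substituted into a
    # sourceUri template via str.format, as format_uri does above.
    template = "https://example.com/api/values?start={start}&site={site_code}"
    resolved = {"start": "2024-01-01T00:00:00+00:00", "site_code": "ABC123"}
    print(template.format(**resolved))
    # https://example.com/api/values?start=2024-01-01T00:00:00+00:00&site=ABC123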

{hydroserverpy-1.3.0b3/src/hydroserverpy → hydroserverpy-1.4.0b3/src/hydroserverpy/api/models}/etl/extractors/http_extractor.py
(removed-line contents below are shown as truncated in the source rendering)

@@ -1,14 +1,16 @@
 import logging
 import requests
 from io import BytesIO
-
+
+from ..etl_configuration import Payload
+from .base import Extractor, ExtractorConfig


 class HTTPExtractor(Extractor):
-    def __init__(self, settings:
+    def __init__(self, settings: ExtractorConfig):
         super().__init__(settings)

-    def extract(self, payload, loader=None):
+    def extract(self, payload: Payload, loader=None):
         """
         Downloads the file from the HTTP/HTTPS server and returns a file-like object.
         """

hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/extractors/local_file_extractor.py (new file)

@@ -0,0 +1,20 @@
+import logging
+from .base import Extractor
+from ..etl_configuration import ExtractorConfig
+
+
+class LocalFileExtractor(Extractor):
+    def __init__(self, extractor_config: ExtractorConfig):
+        super().__init__(extractor_config)
+
+    def extract(self):
+        """
+        Opens the file and returns a file-like object.
+        """
+        try:
+            file_handle = open(self.cfg.source_uri, "r")
+            logging.info(f"Successfully opened file '{self.cfg.source_uri}'.")
+            return file_handle
+        except Exception as e:
+            logging.error(f"Error opening file '{self.cfg.source_uri}': {e}")
+            return None

hydroserverpy-1.4.0b3/src/hydroserverpy/api/models/etl/factories.py (new file)

@@ -0,0 +1,23 @@
+from .extractors import HTTPExtractor, LocalFileExtractor
+from .transformers import JSONTransformer, CSVTransformer
+from .loaders import HydroServerLoader
+from .etl_configuration import ExtractorConfig, TransformerConfig, LoaderConfig
+
+EXTRACTORS = {"HTTP": HTTPExtractor, "local": LocalFileExtractor}
+TRANSFORMERS = {"JSON": JSONTransformer, "CSV": CSVTransformer}
+LOADERS = {"HydroServer": HydroServerLoader}
+
+
+def extractor_factory(settings: ExtractorConfig):
+    cls = EXTRACTORS[settings.type]
+    return cls(settings)
+
+
+def transformer_factory(settings: TransformerConfig):
+    cls = TRANSFORMERS[settings.type]
+    return cls(settings)
+
+
+def loader_factory(settings: LoaderConfig, auth_context, data_source_id: str):
+    cls = LOADERS[settings.type]
+    return cls(auth_context, data_source_id)
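
A short sketch of how these factories dispatch on the discriminated "type" field of a config model. Note that the config model etl_configuration.HTTPExtractor and the runtime class extractors.HTTPExtractor share a name; the example below disambiguates via the module, and the source URI is an assumed value:

    # Sketch only: config values are illustrative.
    from hydroserverpy.api.models.etl import etl_configuration

    cfg = etl_configuration.HTTPExtractor(
        type="HTTP", sourceUri="https://example.com/data.csv"
    )
    extractor = extractor_factory(cfg)  # -> runtime extractors.HTTPExtractor instance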