hydroserverpy 1.3.0b3__py3-none-any.whl → 1.4.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of hydroserverpy might be problematic.

Files changed (39)
  1. hydroserverpy/__init__.py +0 -2
  2. hydroserverpy/api/models/etl/__init__.py +26 -0
  3. hydroserverpy/api/models/etl/data_source.py +107 -72
  4. hydroserverpy/api/models/etl/etl_configuration.py +224 -0
  5. hydroserverpy/api/models/etl/extractors/__init__.py +6 -0
  6. hydroserverpy/{etl → api/models/etl}/extractors/base.py +16 -19
  7. hydroserverpy/{etl → api/models/etl}/extractors/http_extractor.py +5 -3
  8. hydroserverpy/api/models/etl/extractors/local_file_extractor.py +20 -0
  9. hydroserverpy/api/models/etl/factories.py +23 -0
  10. hydroserverpy/api/models/etl/loaders/__init__.py +4 -0
  11. hydroserverpy/{etl → api/models/etl}/loaders/base.py +0 -2
  12. hydroserverpy/api/models/etl/loaders/hydroserver_loader.py +100 -0
  13. hydroserverpy/api/models/etl/schedule.py +16 -0
  14. hydroserverpy/api/models/etl/status.py +14 -0
  15. hydroserverpy/api/models/etl/transformers/__init__.py +5 -0
  16. hydroserverpy/api/models/etl/transformers/base.py +128 -0
  17. hydroserverpy/{etl → api/models/etl}/transformers/csv_transformer.py +24 -13
  18. hydroserverpy/{etl → api/models/etl}/transformers/json_transformer.py +7 -6
  19. hydroserverpy/api/services/etl/data_source.py +1 -4
  20. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/METADATA +1 -1
  21. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/RECORD +28 -29
  22. hydroserverpy/etl/__init__.py +0 -21
  23. hydroserverpy/etl/extractors/__init__.py +0 -0
  24. hydroserverpy/etl/extractors/local_file_extractor.py +0 -19
  25. hydroserverpy/etl/hydroserver_etl.py +0 -40
  26. hydroserverpy/etl/loaders/__init__.py +0 -0
  27. hydroserverpy/etl/loaders/hydroserver_loader.py +0 -71
  28. hydroserverpy/etl/transformers/__init__.py +0 -0
  29. hydroserverpy/etl/transformers/base.py +0 -64
  30. hydroserverpy/etl_csv/__init__.py +0 -0
  31. hydroserverpy/etl_csv/exceptions.py +0 -14
  32. hydroserverpy/etl_csv/hydroserver_etl_csv.py +0 -346
  33. /hydroserverpy/{etl → api/models/etl}/extractors/ftp_extractor.py +0 -0
  34. /hydroserverpy/{etl → api/models/etl}/timestamp_parser.py +0 -0
  35. /hydroserverpy/{etl → api/models/etl}/types.py +0 -0
  36. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/WHEEL +0 -0
  37. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/licenses/LICENSE +0 -0
  38. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/top_level.txt +0 -0
  39. {hydroserverpy-1.3.0b3.dist-info → hydroserverpy-1.4.0b3.dist-info}/zip-safe +0 -0
hydroserverpy/__init__.py CHANGED
@@ -1,9 +1,7 @@
 from .api.client import HydroServer
-from .etl.hydroserver_etl import HydroServerETL
 from .quality import HydroServerQualityControl

 __all__ = [
     "HydroServer",
     "HydroServerQualityControl",
-    "HydroServerETL",
 ]
hydroserverpy/api/models/etl/__init__.py ADDED
@@ -0,0 +1,26 @@
+from .extractors import Extractor, HTTPExtractor, LocalFileExtractor, FTPExtractor
+from .transformers import JSONTransformer, CSVTransformer, Transformer
+from .loaders import HydroServerLoader, Loader
+
+from .etl_configuration import EtlConfiguration
+from .schedule import Schedule
+from .status import Status
+from .orchestration_system import OrchestrationSystem
+from .data_source import DataSource
+
+__all__ = [
+    "CSVTransformer",
+    "JSONTransformer",
+    "LocalFileExtractor",
+    "FTPExtractor",
+    "HTTPExtractor",
+    "Extractor",
+    "Transformer",
+    "Loader",
+    "HydroServerLoader",
+    "EtlConfiguration",
+    "Schedule",
+    "Status",
+    "OrchestrationSystem",
+    "DataSource",
+]
hydroserverpy/api/models/etl/data_source.py CHANGED
@@ -1,111 +1,146 @@
+from __future__ import annotations
+from datetime import datetime, timedelta, timezone
+from functools import cached_property
+import logging
 import uuid
-import tempfile
-import requests
-from typing import Union, ClassVar, Optional, TYPE_CHECKING, List
+from typing import ClassVar, TYPE_CHECKING, List, Optional, Union
+import croniter
+import pandas as pd
 from pydantic import Field
-from hydroserverpy.etl_csv.hydroserver_etl_csv import HydroServerETLCSV
-from .orchestration_system import OrchestrationSystem
-from .orchestration_configuration import OrchestrationConfigurationFields
-from ..sta.datastream import Datastream
+
 from ..base import HydroServerBaseModel
+from ..sta.datastream import Datastream
+from .orchestration_system import OrchestrationSystem
+from .etl_configuration import EtlConfiguration
+from .schedule import Schedule
+from .status import Status
+from .factories import extractor_factory, transformer_factory, loader_factory
+from .loaders import HydroServerLoader

 if TYPE_CHECKING:
     from hydroserverpy import HydroServer
     from hydroserverpy.api.models import Workspace


-class DataSource(
-    HydroServerBaseModel, OrchestrationConfigurationFields
-):
+class DataSource(HydroServerBaseModel):
     name: str = Field(..., max_length=255)
-    settings: Optional[dict] = None
+    settings: EtlConfiguration
     orchestration_system_id: uuid.UUID
+    schedule: Schedule
+    status: Status
     workspace_id: uuid.UUID

     _editable_fields: ClassVar[set[str]] = {
-        "name", "settings", "interval", "interval_units", "crontab", "start_time", "end_time", "last_run_successful",
-        "last_run_message", "last_run", "next_run", "paused"
+        "name",
+        "settings",
+        "status",
+        "schedule",
+        "interval",
+        "interval_units",
+        "crontab",
+        "start_time",
+        "end_time",
+        "last_run_successful",
+        "last_run_message",
+        "last_run",
+        "next_run",
+        "paused",
     }

-    def __init__(self, client: "HydroServer", **data):
+    def __init__(self, client: HydroServer, **data):
         super().__init__(client=client, service=client.datasources, **data)

-        self._workspace = None
-        self._orchestration_system = None
-        self._datastreams = None
-
     @classmethod
     def get_route(cls):
         return "data-sources"

-    @property
-    def workspace(self) -> "Workspace":
-        """The workspace this data source belongs to."""
-
-        if self._workspace is None:
-            self._workspace = self.client.workspaces.get(uid=self.workspace_id)
-
-        return self._workspace
-
-    @property
-    def orchestration_system(self) -> "OrchestrationSystem":
-        """The orchestration system that manages this data source."""
+    @cached_property
+    def workspace(self) -> Workspace:
+        return self.client.workspaces.get(uid=self.workspace_id)

-        if self._orchestration_system is None:
-            self._orchestration_system = self.client.orchestrationsystems.get(uid=self.orchestration_system_id)
+    @cached_property
+    def orchestration_system(self) -> OrchestrationSystem:
+        return self.client.orchestrationsystems.get(uid=self.orchestration_system_id)

-        return self._orchestration_system
-
-    @property
-    def datastreams(self) -> List["Datastream"]:
-        """The datastreams this data source provides data for."""
-
-        if self._datastreams is None:
-            self._datastreams = self.client.datastreams.list(data_source=self.uid, fetch_all=True).items
-
-        return self._datastreams
+    @cached_property
+    def datastreams(self) -> List[Datastream]:
+        return self.client.datastreams.list(data_source=self.uid, fetch_all=True).items

+    # TODO: Add functions like add_payload, add_mapping, etc. and don't allow the user to manually
+    # link or unlink datastreams - handle that automatically.
     def add_datastream(self, datastream: Union["Datastream", uuid.UUID, str]):
         """Add a datastream to this data source."""

-        self.client.datasources.add_datastream(
-            uid=self.uid, datastream=datastream
-        )
+        self.client.datasources.add_datastream(uid=self.uid, datastream=datastream)

     def remove_datastream(self, datastream: Union["Datastream", uuid.UUID, str]):
         """Remove a datastream from this data source."""

-        self.client.datasources.remove_datastream(
-            uid=self.uid, datastream=datastream
+        self.client.datasources.remove_datastream(uid=self.uid, datastream=datastream)
+
+    def _next_run(self) -> Optional[str]:
+        now = datetime.now(timezone.utc)
+        if cron := self.schedule.crontab:
+            return croniter.croniter(cron, now).get_next(datetime).isoformat()
+        if iv := self.schedule.interval:
+            unit = self.schedule.interval_units or "minutes"
+            return (now + timedelta(**{unit: iv})).isoformat()
+        return None
+
+    def _update_status(self, loader: HydroServerLoader, success: bool, msg: str):
+        short_msg = msg if len(msg) <= 255 else msg[:252] + "…"
+        loader.client.datasources.update(
+            uid=self.uid,
+            last_run=datetime.now(timezone.utc).isoformat(),
+            last_run_successful=success,
+            last_run_message=short_msg,
+            next_run=self._next_run(),
         )

-    # TODO: Replace with ETL module.
-    def load_data(self):
-        """Load data for this data source."""
+    def is_empty(self, data):
+        if data is None:
+            return True
+        if isinstance(data, pd.DataFrame) and data.empty:
+            return True
+        return False

-        if self.paused is True:
+    def load_data(self, payload_name: str = None):
+        """Load data for this data source."""
+        if self.status.paused is True:
             return

-        if self.settings["extractor"]["type"] == "local":
-            with open(self.settings["extractor"]["sourceUri"]) as data_file:
-                loader = HydroServerETLCSV(
-                    self.client, data_file=data_file, data_source=self
-                )
-                loader.run()
-        elif self.settings["extractor"]["type"] == "HTTP":
-            with tempfile.NamedTemporaryFile(mode="w+") as temp_file:
-                response = requests.get(
-                    self.settings["extractor"]["sourceUri"],
-                    stream=True,
-                    timeout=60,
+        if payload_name:
+            self.load_data_for_payload(payload_name)
+        else:
+            for p in self.settings.payloads:
+                self.load_data_for_payload(p.name)
+
+    def load_data_for_payload(self, payload_name: str):
+        payload = next(p for p in self.settings.payloads if p.name == payload_name)
+
+        extractor_cls = extractor_factory(self.settings.extractor)
+        transformer_cls = transformer_factory(self.settings.transformer)
+        loader_cls = loader_factory(self.settings.loader, self.client, self.uid)
+
+        try:
+            logging.info("Starting extract")
+            data = extractor_cls.extract(payload, loader_cls)
+            if self.is_empty(data):
+                self._update_status(
+                    loader_cls, True, "No data returned from the extractor"
                 )
-                response.raise_for_status()
-                chunk_size = 1024 * 1024 * 10  # Use a 10mb chunk size.
-                for chunk in response.iter_content(chunk_size=chunk_size):
-                    if chunk:
-                        temp_file.write(chunk.decode("utf-8"))
-                temp_file.seek(0)
-                loader = HydroServerETLCSV(
-                    self.client, data_file=temp_file, data_source=self
+                return
+
+            logging.info("Starting transform")
+            data = transformer_cls.transform(data, payload.mappings)
+            if self.is_empty(data):
+                self._update_status(
+                    loader_cls, True, "No data returned from the transformer"
                 )
-                loader.run()
+                return
+
+            logging.info("Starting load")
+            loader_cls.load(data, payload)
+            self._update_status(loader_cls, True, "OK")
+        except Exception as e:
+            self._update_status(loader_cls, False, str(e))
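The net effect: load_data no longer shells out to HydroServerETLCSV; each payload in the data source's EtlConfiguration is run through extract → transform → load, with run status recorded afterward. A minimal usage sketch (the UID is hypothetical, and it assumes the datasources service exposes get() like the other services shown above):

    # Assumes `client` is an authenticated hydroserverpy HydroServer instance.
    ds = client.datasources.get(uid="0f9c8d7e-...")  # hypothetical data source UID
    ds.load_data()                       # run every payload in ds.settings.payloads
    ds.load_data(payload_name="site-a")  # or a single named payload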
hydroserverpy/api/models/etl/etl_configuration.py ADDED
@@ -0,0 +1,224 @@
+from typing import Annotated, Dict, List, Literal, Optional, Union
+from pydantic import BaseModel, Field, field_validator
+from enum import Enum
+
+WorkflowType = Literal["ETL", "Aggregation", "Virtual", "SDL"]
+CSVDelimiterType = Literal[",", "|", "\t", ";", " "]
+ExtractorType = Literal["HTTP", "local"]
+TransformerType = Literal["JSON", "CSV"]
+LoaderType = Literal["HydroServer"]
+IdentifierType = Literal["name", "index"]
+RunTimeValue = Literal["jobExecutionTime", "latestObservationTimestamp"]
+
+
+class FixedOffsetTimezone(str, Enum):
+    UTC_MINUS_1200 = "-1200"
+    UTC_MINUS_1100 = "-1100"
+    UTC_MINUS_1000 = "-1000"
+    UTC_MINUS_0900 = "-0900"
+    UTC_MINUS_0800 = "-0800"
+    UTC_MINUS_0700 = "-0700"
+    UTC_MINUS_0600 = "-0600"
+    UTC_MINUS_0500 = "-0500"
+    UTC_MINUS_0430 = "-0430"
+    UTC_MINUS_0400 = "-0400"
+    UTC_MINUS_0330 = "-0330"
+    UTC_MINUS_0300 = "-0300"
+    UTC_MINUS_0200 = "-0200"
+    UTC_MINUS_0100 = "-0100"
+    UTC_PLUS_0000 = "+0000"
+    UTC_PLUS_0100 = "+0100"
+    UTC_PLUS_0200 = "+0200"
+    UTC_PLUS_0300 = "+0300"
+    UTC_PLUS_0330 = "+0330"
+    UTC_PLUS_0400 = "+0400"
+    UTC_PLUS_0430 = "+0430"
+    UTC_PLUS_0500 = "+0500"
+    UTC_PLUS_0530 = "+0530"
+    UTC_PLUS_0545 = "+0545"
+    UTC_PLUS_0600 = "+0600"
+    UTC_PLUS_0630 = "+0630"
+    UTC_PLUS_0700 = "+0700"
+    UTC_PLUS_0800 = "+0800"
+    UTC_PLUS_0845 = "+0845"
+    UTC_PLUS_0900 = "+0900"
+    UTC_PLUS_0930 = "+0930"
+    UTC_PLUS_1000 = "+1000"
+    UTC_PLUS_1030 = "+1030"
+    UTC_PLUS_1100 = "+1100"
+    UTC_PLUS_1130 = "+1130"
+    UTC_PLUS_1200 = "+1200"
+    UTC_PLUS_1245 = "+1245"
+    UTC_PLUS_1300 = "+1300"
+    UTC_PLUS_1400 = "+1400"
+
+
+class TimestampFormat(str, Enum):
+    ISO8601 = "ISO8601"
+    naive = "naive"
+    custom = "custom"
+
+
+class TimezoneMode(str, Enum):
+    utc = "utc"  # always UTC
+    daylightSavings = "daylightSavings"  # IANA / DST-aware
+    fixedOffset = "fixedOffset"  # constant offset
+    embeddedOffset = "embeddedOffset"  # offset in ISO string
+
+
+class Timestamp(BaseModel):
+    key: Optional[str] = None
+    format: TimestampFormat
+    custom_format: Optional[str] = Field(None, alias="customFormat")
+    timezone_mode: TimezoneMode = Field(..., alias="timezoneMode")
+    timezone: Optional[Union[FixedOffsetTimezone, str]] = Field(None, alias="timezone")
+
+    class Config:
+        allow_population_by_field_name = True
+
+    @field_validator("timezone")
+    def check_timezone(cls, timezone_value, info):
+        mode = info.data.get("timezone_mode")
+        if mode == TimezoneMode.fixedOffset and timezone_value is None:
+            raise ValueError("`timezone` must be set when timezoneMode is fixedOffset")
+        return timezone_value
+
+
+class PerPayloadPlaceholder(BaseModel):
+    name: str
+    type: Literal["perPayload"]
+
+
+class RunTimePlaceholder(BaseModel):
+    name: str
+    type: Literal["runTime"]
+    run_time_value: RunTimeValue = Field(..., alias="runTimeValue")
+    timestamp: Timestamp
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+PlaceholderVariable = Annotated[
+    Union[PerPayloadPlaceholder, RunTimePlaceholder],
+    Field(discriminator="type"),
+]
+
+
+class BaseExtractor(BaseModel):
+    type: ExtractorType
+    source_uri: str = Field(..., alias="sourceUri")
+    placeholder_variables: Optional[List[PlaceholderVariable]] = Field(
+        default_factory=list,
+        alias="placeholderVariables",
+    )
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class HTTPExtractor(BaseExtractor):
+    type: Literal["HTTP"]
+
+
+class LocalFileExtractor(BaseExtractor):
+    type: Literal["local"]
+
+
+ExtractorConfig = Annotated[
+    Union[HTTPExtractor, LocalFileExtractor], Field(discriminator="type")
+]
+
+
+class BaseTransformer(BaseModel):
+    type: TransformerType
+    timestamp: Timestamp
+
+
+class JSONTransformer(BaseTransformer):
+    type: Literal["JSON"]
+    jmespath: str = Field(..., alias="JMESPath")
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class CSVTransformer(BaseTransformer):
+    type: Literal["CSV"]
+    header_row: Optional[int] = Field(..., alias="headerRow")
+    data_start_row: int = Field(..., alias="dataStartRow")
+    delimiter: CSVDelimiterType
+    identifier_type: IdentifierType = Field(..., alias="identifierType")
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+TransformerConfig = Union[JSONTransformer, CSVTransformer]
+
+
+class BaseLoaderConfig(BaseModel):
+    type: LoaderType
+
+
+class HydroServerLoaderConfig(BaseLoaderConfig):
+    type: Literal["HydroServer"]
+
+
+LoaderConfig = HydroServerLoaderConfig
+
+
+class ExpressionDataTransformation(BaseModel):
+    type: Literal["expression"]
+    expression: str
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class LookupTableDataTransformation(BaseModel):
+    type: Literal["lookup"]
+    lookup_table_id: str = Field(..., alias="lookupTableId")
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+DataTransformation = Union[ExpressionDataTransformation, LookupTableDataTransformation]
+
+
+class MappingPath(BaseModel):
+    target_identifier: Union[str, int] = Field(..., alias="targetIdentifier")
+    data_transformations: List[DataTransformation] = Field(
+        default_factory=list, alias="dataTransformations"
+    )
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class SourceTargetMapping(BaseModel):
+    source_identifier: Union[str, int] = Field(..., alias="sourceIdentifier")
+    paths: List[MappingPath] = Field(default_factory=list)
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class Payload(BaseModel):
+    name: str = ""
+    mappings: List[SourceTargetMapping] = Field(default_factory=list)
+    extractor_variables: Dict[str, str] = Field(
+        default_factory=dict, alias="extractorVariables"
+    )
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class EtlConfiguration(BaseModel):
+    type: WorkflowType
+    extractor: ExtractorConfig
+    transformer: TransformerConfig
+    loader: LoaderConfig
+    payloads: List[Payload]
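For orientation, a minimal settings document that should validate against these models looks like the following (keys use the camelCase pydantic aliases; all values are illustrative):

    from hydroserverpy.api.models.etl import EtlConfiguration

    config = EtlConfiguration(
        type="ETL",
        extractor={"type": "HTTP", "sourceUri": "https://example.com/data.csv"},
        transformer={
            "type": "CSV",
            "headerRow": 1,
            "dataStartRow": 2,
            "delimiter": ",",
            "identifierType": "name",
            # embeddedOffset avoids the fixedOffset validator's timezone requirement
            "timestamp": {"format": "ISO8601", "timezoneMode": "embeddedOffset"},
        },
        loader={"type": "HydroServer"},
        payloads=[{"name": "site-a"}],  # mappings/extractorVariables default to empty
    )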
hydroserverpy/api/models/etl/extractors/__init__.py ADDED
@@ -0,0 +1,6 @@
+from .base import Extractor
+from .ftp_extractor import FTPExtractor
+from .http_extractor import HTTPExtractor
+from .local_file_extractor import LocalFileExtractor
+
+__all__ = ["Extractor", "HTTPExtractor", "LocalFileExtractor", "FTPExtractor"]
hydroserverpy/{etl → api/models/etl}/extractors/base.py RENAMED
@@ -2,49 +2,46 @@ from abc import abstractmethod
 import logging
 import pandas as pd
 from datetime import datetime
-
-from hydroserverpy.etl.timestamp_parser import TimestampParser
+from ..etl_configuration import ExtractorConfig, Payload
+from ..timestamp_parser import TimestampParser


 class Extractor:
-    def __init__(self, settings: dict):
-        self.settings = settings
-        self.source_uri = settings["sourceUri"]
+    def __init__(self, extractor_config: ExtractorConfig):
+        self.cfg = extractor_config

-    def resolve_placeholder_variables(self, payload, loader):
+    def resolve_placeholder_variables(self, payload: Payload, loader):
         logging.info(f"Creating runtime variables...")
         filled = {}
-        for var in self.settings.get("placeholderVariables", []):
-            name = var["name"]
-            var_type = var.get("type", None)
+        for placeholder in self.cfg.placeholder_variables:
+            name = placeholder.name

-            if var_type == "runTime":
+            if placeholder.type == "runTime":
                 logging.info(f"Resolving runtime var: {name}")
-                if var.get("runTimeValue", None) == "latestObservationTimestamp":
+                if placeholder.run_time_value == "latestObservationTimestamp":
                     value = loader.earliest_begin_date(payload)
-                elif var.get("runTimeValue", None) == "jobExecutionTime":
+                elif placeholder.run_time_value == "jobExecutionTime":
                     value = pd.Timestamp.now(tz="UTC")
-            elif var_type == "perPayload":
+            elif placeholder.type == "perPayload":
                 logging.info(f"Resolving payload var: {name}")
-                payload_vars = payload.get("extractorVariables", {})
-                if name not in payload_vars:
+                if name not in payload.extractor_variables:
                     raise KeyError(f"Missing per-payload variable '{name}'")
-                value = payload_vars[name]
+                value = payload.extractor_variables[name]
             else:
                 continue

             if isinstance(value, (datetime, pd.Timestamp)):
-                parser = TimestampParser(var["timestamp"])
+                parser = TimestampParser(placeholder.timestamp)
                 value = parser.utc_to_string(value)

             filled[name] = value
         if not filled:
-            return self.source_uri
+            return self.cfg.source_uri
         return self.format_uri(filled)

     def format_uri(self, placeholder_variables):
         try:
-            uri = self.source_uri.format(**placeholder_variables)
+            uri = self.cfg.source_uri.format(**placeholder_variables)
         except KeyError as e:
             missing_key = e.args[0]
             raise KeyError(f"Missing placeholder variable: {missing_key}")
hydroserverpy/{etl → api/models/etl}/extractors/http_extractor.py RENAMED
@@ -1,14 +1,16 @@
 import logging
 import requests
 from io import BytesIO
-from .base import Extractor
+
+from ..etl_configuration import Payload
+from .base import Extractor, ExtractorConfig


 class HTTPExtractor(Extractor):
-    def __init__(self, settings: dict):
+    def __init__(self, settings: ExtractorConfig):
         super().__init__(settings)

-    def extract(self, payload, loader=None):
+    def extract(self, payload: Payload, loader=None):
         """
         Downloads the file from the HTTP/HTTPS server and returns a file-like object.
         """
hydroserverpy/api/models/etl/extractors/local_file_extractor.py ADDED
@@ -0,0 +1,20 @@
+import logging
+from .base import Extractor
+from ..etl_configuration import ExtractorConfig
+
+
+class LocalFileExtractor(Extractor):
+    def __init__(self, extractor_config: ExtractorConfig):
+        super().__init__(extractor_config)
+
+    def extract(self):
+        """
+        Opens the file and returns a file-like object.
+        """
+        try:
+            file_handle = open(self.cfg.source_uri, "r")
+            logging.info(f"Successfully opened file '{self.cfg.source_uri}'.")
+            return file_handle
+        except Exception as e:
+            logging.error(f"Error opening file '{self.cfg.source_uri}': {e}")
+            return None
hydroserverpy/api/models/etl/factories.py ADDED
@@ -0,0 +1,23 @@
+from .extractors import HTTPExtractor, LocalFileExtractor
+from .transformers import JSONTransformer, CSVTransformer
+from .loaders import HydroServerLoader
+from .etl_configuration import ExtractorConfig, TransformerConfig, LoaderConfig
+
+EXTRACTORS = {"HTTP": HTTPExtractor, "local": LocalFileExtractor}
+TRANSFORMERS = {"JSON": JSONTransformer, "CSV": CSVTransformer}
+LOADERS = {"HydroServer": HydroServerLoader}
+
+
+def extractor_factory(settings: ExtractorConfig):
+    cls = EXTRACTORS[settings.type]
+    return cls(settings)
+
+
+def transformer_factory(settings: TransformerConfig):
+    cls = TRANSFORMERS[settings.type]
+    return cls(settings)
+
+
+def loader_factory(settings: LoaderConfig, auth_context, data_source_id: str):
+    cls = LOADERS[settings.type]
+    return cls(auth_context, data_source_id)
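The factories dispatch on each config model's type discriminator, which is how DataSource.load_data_for_payload assembles a pipeline. A standalone sketch (assumes a validated EtlConfiguration named config, an authenticated client, and a data source UID):

    extractor = extractor_factory(config.extractor)        # HTTPExtractor or LocalFileExtractor
    transformer = transformer_factory(config.transformer)  # JSONTransformer or CSVTransformer
    loader = loader_factory(config.loader, client, data_source_id)  # HydroServerLoader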
hydroserverpy/api/models/etl/loaders/__init__.py ADDED
@@ -0,0 +1,4 @@
+from .base import Loader
+from .hydroserver_loader import HydroServerLoader
+
+__all__ = ["Loader", "HydroServerLoader"]
hydroserverpy/{etl → api/models/etl}/loaders/base.py RENAMED
@@ -1,6 +1,4 @@
 from abc import ABC, abstractmethod
-from typing import Dict
-import pandas as pd


 class Loader(ABC):